tid_rdma.c (160823B)
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * Copyright(c) 2018 - 2020 Intel Corporation.
 *
 */

#include "hfi.h"
#include "qp.h"
#include "rc.h"
#include "verbs.h"
#include "tid_rdma.h"
#include "exp_rcv.h"
#include "trace.h"

/**
 * DOC: TID RDMA READ protocol
 *
 * This is an end-to-end protocol at the hfi1 level between two nodes that
 * improves performance by avoiding data copy on the requester side. It
 * converts a qualified RDMA READ request into a TID RDMA READ request on
 * the requester side and thereafter handles the request and response
 * differently. To be qualified, the RDMA READ request should meet the
 * following:
 * -- The total data length should be greater than 256K;
 * -- The total data length should be a multiple of 4K page size;
 * -- Each local scatter-gather entry should be 4K page aligned;
 * -- Each local scatter-gather entry should be a multiple of 4K page size;
 */

#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)

/* Maximum number of packets within a flow generation. */
#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)

#define GENERATION_MASK 0xFFFFF

static u32 mask_generation(u32 a)
{
	return a & GENERATION_MASK;
}

/* Reserved generation value to set to unused flows for kernel contexts */
#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)

/*
 * J_KEY for kernel contexts when TID RDMA is used.
 * See generate_jkey() in hfi.h for more information.
 */
#define TID_RDMA_JKEY 32
#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)

/* Maximum number of segments in flight per QP request. */
#define TID_RDMA_MAX_READ_SEGS_PER_REQ  6
#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
			TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
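
/*
 * With the limits above, MAX_REQ is max(6, 4) = 6, so MAX_FLOWS rounds
 * 6 + 1 up to the next power of two: each per-request flow circular
 * buffer holds 8 entries.
 */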

#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE)

#define TID_RDMA_DESTQP_FLOW_SHIFT 11
#define TID_RDMA_DESTQP_FLOW_MASK 0x1f

#define TID_OPFN_QP_CTXT_MASK 0xff
#define TID_OPFN_QP_CTXT_SHIFT 56
#define TID_OPFN_QP_KDETH_MASK 0xff
#define TID_OPFN_QP_KDETH_SHIFT 48
#define TID_OPFN_MAX_LEN_MASK 0x7ff
#define TID_OPFN_MAX_LEN_SHIFT 37
#define TID_OPFN_TIMEOUT_MASK 0x1f
#define TID_OPFN_TIMEOUT_SHIFT 32
#define TID_OPFN_RESERVED_MASK 0x3f
#define TID_OPFN_RESERVED_SHIFT 26
#define TID_OPFN_URG_MASK 0x1
#define TID_OPFN_URG_SHIFT 25
#define TID_OPFN_VER_MASK 0x7
#define TID_OPFN_VER_SHIFT 22
#define TID_OPFN_JKEY_MASK 0x3f
#define TID_OPFN_JKEY_SHIFT 16
#define TID_OPFN_MAX_READ_MASK 0x3f
#define TID_OPFN_MAX_READ_SHIFT 10
#define TID_OPFN_MAX_WRITE_MASK 0x3f
#define TID_OPFN_MAX_WRITE_SHIFT 4

/*
 * OPFN TID layout
 *
 * 63               47               31               15
 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
 * N - the context Number
 * K - the Kdeth_qp
 * M - Max_len
 * T - Timeout
 * D - reserveD
 * V - version
 * U - Urg capable
 * J - Jkey
 * R - max_Read
 * W - max_Write
 * C - Capcode
 */

static void tid_rdma_trigger_resume(struct work_struct *work);
static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
					 gfp_t gfp);
static void hfi1_init_trdma_req(struct rvt_qp *qp,
				struct tid_rdma_request *req);
static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
static void hfi1_tid_timeout(struct timer_list *t);
static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
static void hfi1_tid_retry_timeout(struct timer_list *t);
static int make_tid_rdma_ack(struct rvt_qp *qp,
			     struct ib_other_headers *ohdr,
			     struct hfi1_pkt_state *ps);
static void hfi1_do_tid_send(struct rvt_qp *qp);
static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
static void tid_rdma_rcv_err(struct hfi1_packet *packet,
			     struct ib_other_headers *ohdr,
			     struct rvt_qp *qp, u32 psn, int diff, bool fecn);
static void update_r_next_psn_fecn(struct hfi1_packet *packet,
				   struct hfi1_qp_priv *priv,
				   struct hfi1_ctxtdata *rcd,
				   struct tid_rdma_flow *flow,
				   bool fecn);

static void validate_r_tid_ack(struct hfi1_qp_priv *priv)
{
	if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
		priv->r_tid_ack = priv->r_tid_tail;
}

static void tid_rdma_schedule_ack(struct rvt_qp *qp)
{
	struct hfi1_qp_priv *priv = qp->priv;

	priv->s_flags |= RVT_S_ACK_PENDING;
	hfi1_schedule_tid_send(qp);
}

static void tid_rdma_trigger_ack(struct rvt_qp *qp)
{
	validate_r_tid_ack(qp->priv);
	tid_rdma_schedule_ack(qp);
}
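
/*
 * Worked example for the encoding below (assuming 4 KiB pages, i.e.
 * PAGE_SHIFT == 12): a 256 KiB max_len is stored as
 * (max_len >> PAGE_SHIFT) - 1 = 63, which fits in the 11-bit Max_len
 * field at bits 47:37 of the OPFN TID layout above.
 */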
static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
{
	return
		(((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
			TID_OPFN_QP_CTXT_SHIFT) |
		((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
			TID_OPFN_QP_KDETH_SHIFT) |
		(((u64)((p->max_len >> PAGE_SHIFT) - 1) &
			TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
		(((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
			TID_OPFN_TIMEOUT_SHIFT) |
		(((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
		(((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
		(((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
			TID_OPFN_MAX_READ_SHIFT) |
		(((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
			TID_OPFN_MAX_WRITE_SHIFT);
}

static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
{
	p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
		       TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
	p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
	p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
		       TID_OPFN_MAX_WRITE_MASK;
	p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
		      TID_OPFN_MAX_READ_MASK;
	p->qp =
		((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
			<< 16) |
		 ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
	p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
	p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
}

void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
{
	struct hfi1_qp_priv *priv = qp->priv;

	p->qp = (RVT_KDETH_QP_PREFIX << 16) | priv->rcd->ctxt;
	p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
	p->jkey = priv->rcd->jkey;
	p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
	p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
	p->timeout = qp->timeout;
	p->urg = is_urg_masked(priv->rcd);
}

bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
{
	struct hfi1_qp_priv *priv = qp->priv;

	*data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
	return true;
}

bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct tid_rdma_params *remote, *old;
	bool ret = true;

	old = rcu_dereference_protected(priv->tid_rdma.remote,
					lockdep_is_held(&priv->opfn.lock));
	data &= ~0xfULL;
	/*
	 * If data passed in is zero, return true so as not to continue the
	 * negotiation process
	 */
	if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
		goto null;
	/*
	 * If kzalloc fails, return false. This will result in:
	 * * at the requester a new OPFN request being generated to retry
	 *   the negotiation
	 * * at the responder, 0 being returned to the requester so as to
	 *   disable TID RDMA at both the requester and the responder
	 */
	remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
	if (!remote) {
		ret = false;
		goto null;
	}

	tid_rdma_opfn_decode(remote, data);
	priv->tid_timer_timeout_jiffies =
		usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
				   1000UL) << 3) * 7);
	trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
	trace_hfi1_opfn_param(qp, 1, remote);
	rcu_assign_pointer(priv->tid_rdma.remote, remote);
	/*
	 * A TID RDMA READ request's segment size is not equal to
	 * remote->max_len only when the request's data length is smaller
	 * than remote->max_len. In that case, there will be only one segment.
	 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
	 * during retry, it will lead to req->cur_seg = 0, which is exactly
	 * what is expected.
	 */
	priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
	priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
	goto free;
null:
	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
	priv->timeout_shift = 0;
free:
	if (old)
		kfree_rcu(old, rcu_head);
	return ret;
}
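
/*
 * Responder side of the OPFN exchange: consume the requester's parameters
 * via tid_rdma_conn_reply() and, on success, encode the local parameters
 * into *data for the reply. *data is left at 0 on failure so that the
 * requester disables TID RDMA as well.
 */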
bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
{
	bool ret;

	ret = tid_rdma_conn_reply(qp, *data);
	*data = 0;
	/*
	 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
	 * TID RDMA could not be enabled. This will result in TID RDMA being
	 * disabled at the requester too.
	 */
	if (ret)
		(void)tid_rdma_conn_req(qp, data);
	return ret;
}

void tid_rdma_conn_error(struct rvt_qp *qp)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct tid_rdma_params *old;

	old = rcu_dereference_protected(priv->tid_rdma.remote,
					lockdep_is_held(&priv->opfn.lock));
	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
	if (old)
		kfree_rcu(old, rcu_head);
}

/* This is called at context initialization time */
int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
{
	if (reinit)
		return 0;

	BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
	BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
	rcd->jkey = TID_RDMA_JKEY;
	hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
	return hfi1_alloc_ctxt_rcv_groups(rcd);
}

/**
 * qp_to_rcd - determine the receive context used by a qp
 * @rdi: rvt dev struct
 * @qp: the qp
 *
 * This routine returns the receive context associated
 * with a qp's qpn.
 *
 * Returns the context.
 */
static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi,
				       struct rvt_qp *qp)
{
	struct hfi1_ibdev *verbs_dev = container_of(rdi,
						    struct hfi1_ibdev,
						    rdi);
	struct hfi1_devdata *dd = container_of(verbs_dev,
					       struct hfi1_devdata,
					       verbs_dev);
	unsigned int ctxt;

	if (qp->ibqp.qp_num == 0)
		ctxt = 0;
	else
		ctxt = hfi1_get_qp_map(dd, qp->ibqp.qp_num >> dd->qos_shift);
	return dd->rcd[ctxt];
}

int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
		      struct ib_qp_init_attr *init_attr)
{
	struct hfi1_qp_priv *qpriv = qp->priv;
	int i, ret;

	qpriv->rcd = qp_to_rcd(rdi, qp);

	spin_lock_init(&qpriv->opfn.lock);
	INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
	INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
	qpriv->flow_state.psn = 0;
	qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
	qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
	qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
	qpriv->s_state = TID_OP(WRITE_RESP);
	qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
	qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
	qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
	qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
	qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
	qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
	qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
	qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
	atomic_set(&qpriv->n_requests, 0);
	atomic_set(&qpriv->n_tid_requests, 0);
	timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
	timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
	INIT_LIST_HEAD(&qpriv->tid_wait);

	if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
		struct hfi1_devdata *dd = qpriv->rcd->dd;

		qpriv->pages =
			kzalloc_node(TID_RDMA_MAX_PAGES *
				     sizeof(*qpriv->pages),
				     GFP_KERNEL, dd->node);
		if (!qpriv->pages)
			return -ENOMEM;
		for (i = 0; i < qp->s_size; i++) {
			struct hfi1_swqe_priv *priv;
			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);

			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
					    dd->node);
			if (!priv)
				return -ENOMEM;

			hfi1_init_trdma_req(qp, &priv->tid_req);
			priv->tid_req.e.swqe = wqe;
			wqe->priv = priv;
		}
		for (i = 0; i < rvt_max_atomic(rdi); i++) {
			struct hfi1_ack_priv *priv;

			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
					    dd->node);
			if (!priv)
				return -ENOMEM;

			hfi1_init_trdma_req(qp, &priv->tid_req);
			priv->tid_req.e.ack = &qp->s_ack_queue[i];

			ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
							    GFP_KERNEL);
			if (ret) {
				kfree(priv);
				return ret;
			}
			qp->s_ack_queue[i].priv = priv;
		}
	}

	return 0;
}

void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct rvt_swqe *wqe;
	u32 i;

	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
		for (i = 0; i < qp->s_size; i++) {
			wqe = rvt_get_swqe_ptr(qp, i);
			kfree(wqe->priv);
			wqe->priv = NULL;
		}
		for (i = 0; i < rvt_max_atomic(rdi); i++) {
			struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;

			if (priv)
				hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
			kfree(priv);
			qp->s_ack_queue[i].priv = NULL;
		}
		cancel_work_sync(&qpriv->opfn.opfn_work);
		kfree(qpriv->pages);
		qpriv->pages = NULL;
	}
}

/* Flow and tid waiter functions */
/**
 * DOC: lock ordering
 *
 * There are two locks involved with the queuing
 * routines: the qp s_lock and the exp_lock.
 *
 * Since the tid space allocation is called from
 * the send engine, the qp s_lock is already held.
 *
 * The allocation routines will get the exp_lock.
 *
 * The first_qp() call is provided to allow the head of
 * the rcd wait queue to be fetched under the exp_lock and
 * followed by a drop of the exp_lock.
 *
 * Any qp in the wait list will have the qp reference count held
 * to hold the qp in memory.
 */

/*
 * return head of rcd wait list
 *
 * Must hold the exp_lock.
 *
 * Get a reference to the QP to hold the QP in memory.
 *
 * The caller must release the reference when the local
 * is no longer being used.
 */
static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
			       struct tid_queue *queue)
	__must_hold(&rcd->exp_lock)
{
	struct hfi1_qp_priv *priv;

	lockdep_assert_held(&rcd->exp_lock);
	priv = list_first_entry_or_null(&queue->queue_head,
					struct hfi1_qp_priv,
					tid_wait);
	if (!priv)
		return NULL;
	rvt_get_qp(priv->owner);
	return priv->owner;
}

/**
 * kernel_tid_waiters - determine rcd wait
 * @rcd: the receive context
 * @queue: the queue to operate on
 * @qp: the head of the qp being processed
 *
 * This routine will return false IFF
 * the list is NULL or the head of the
 * list is the indicated qp.
 *
 * Must hold the qp s_lock and the exp_lock.
 *
 * Return:
 * false if either of the conditions below are satisfied:
 * 1. The list is empty or
 * 2. The indicated qp is at the head of the list and the
 *    HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
 * true is returned otherwise.
 */
static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
			       struct tid_queue *queue, struct rvt_qp *qp)
	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
{
	struct rvt_qp *fqp;
	bool ret = true;

	lockdep_assert_held(&qp->s_lock);
	lockdep_assert_held(&rcd->exp_lock);
	fqp = first_qp(rcd, queue);
	if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
		ret = false;
	rvt_put_qp(fqp);
	return ret;
}

/**
 * dequeue_tid_waiter - dequeue the qp from the list
 * @rcd: the receive context
 * @queue: the queue to operate on
 * @qp: the qp to remove from the wait list
 *
 * This routine removes the indicated qp from the
 * wait list if it is there.
 *
 * This should be done after the hardware flow and
 * tid array resources have been allocated.
 *
 * Must hold the qp s_lock and the rcd exp_lock.
 *
 * It assumes the s_lock to protect the s_flags
 * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
 */
static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
			       struct tid_queue *queue, struct rvt_qp *qp)
	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *priv = qp->priv;

	lockdep_assert_held(&qp->s_lock);
	lockdep_assert_held(&rcd->exp_lock);
	if (list_empty(&priv->tid_wait))
		return;
	list_del_init(&priv->tid_wait);
	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
	queue->dequeue++;
	rvt_put_qp(qp);
}

/**
 * queue_qp_for_tid_wait - suspend QP on tid space
 * @rcd: the receive context
 * @queue: the queue to operate on
 * @qp: the qp
 *
 * The qp is inserted at the tail of the rcd
 * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
 *
 * Must hold the qp s_lock and the exp_lock.
 */
static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
				  struct tid_queue *queue, struct rvt_qp *qp)
	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *priv = qp->priv;

	lockdep_assert_held(&qp->s_lock);
	lockdep_assert_held(&rcd->exp_lock);
	if (list_empty(&priv->tid_wait)) {
		qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
		list_add_tail(&priv->tid_wait, &queue->queue_head);
		priv->tid_enqueue = ++queue->enqueue;
		rcd->dd->verbs_dev.n_tidwait++;
		trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
		rvt_get_qp(qp);
	}
}

/**
 * __trigger_tid_waiter - trigger tid waiter
 * @qp: the qp
 *
 * This is a private entrance to schedule the qp
 * assuming the caller is holding the qp->s_lock.
 */
static void __trigger_tid_waiter(struct rvt_qp *qp)
	__must_hold(&qp->s_lock)
{
	lockdep_assert_held(&qp->s_lock);
	if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
		return;
	trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
	hfi1_schedule_send(qp);
}

/**
 * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
 * @qp: the qp
 *
 * trigger a schedule for a waiting qp in a deadlock
 * safe manner. The qp reference is held prior
 * to this call via first_qp().
 *
 * If the qp trigger was already scheduled (!rval)
 * the reference is dropped, otherwise the resume
 * or the destroy cancel will dispatch the reference.
 */
static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
{
	struct hfi1_qp_priv *priv;
	struct hfi1_ibport *ibp;
	struct hfi1_pportdata *ppd;
	struct hfi1_devdata *dd;
	bool rval;

	if (!qp)
		return;

	priv = qp->priv;
	ibp = to_iport(qp->ibqp.device, qp->port_num);
	ppd = ppd_from_ibp(ibp);
	dd = dd_from_ibdev(qp->ibqp.device);

	rval = queue_work_on(priv->s_sde ?
			     priv->s_sde->cpu :
			     cpumask_first(cpumask_of_node(dd->node)),
			     ppd->hfi1_wq,
			     &priv->tid_rdma.trigger_work);
	if (!rval)
		rvt_put_qp(qp);
}

/**
 * tid_rdma_trigger_resume - field a trigger work request
 * @work: the work item
 *
 * Complete the off qp trigger processing by directly
 * calling the progress routine.
 */
static void tid_rdma_trigger_resume(struct work_struct *work)
{
	struct tid_rdma_qp_params *tr;
	struct hfi1_qp_priv *priv;
	struct rvt_qp *qp;

	tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
	priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
	qp = priv->owner;
	spin_lock_irq(&qp->s_lock);
	if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
		spin_unlock_irq(&qp->s_lock);
		hfi1_do_send(priv->owner, true);
	} else {
		spin_unlock_irq(&qp->s_lock);
	}
	rvt_put_qp(qp);
}

/*
 * tid_rdma_flush_wait - unwind any tid space wait
 *
 * This is called when resetting a qp to
 * allow a destroy or reset to get rid
 * of any tid space linkage and reference counts.
 */
static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
	__must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *priv;

	if (!qp)
		return;
	lockdep_assert_held(&qp->s_lock);
	priv = qp->priv;
	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
	spin_lock(&priv->rcd->exp_lock);
	if (!list_empty(&priv->tid_wait)) {
		list_del_init(&priv->tid_wait);
		qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
		queue->dequeue++;
		rvt_put_qp(qp);
	}
	spin_unlock(&priv->rcd->exp_lock);
}

void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
	__must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *priv = qp->priv;

	_tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
	_tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
}

/* Flow functions */
/**
 * kern_reserve_flow - allocate a hardware flow
 * @rcd: the context to use for allocation
 * @last: the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
 *        signify "don't care".
 *
 * Use a bit mask based allocation to reserve a hardware
 * flow for use in receiving KDETH data packets. If a preferred flow is
 * specified the function will attempt to reserve that flow again, if
 * available.
 *
 * The exp_lock must be held.
 *
 * Return:
 * On success: a value between 0 and RXE_NUM_TID_FLOWS - 1
 * On failure: -EAGAIN
 */
static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
	__must_hold(&rcd->exp_lock)
{
	int nr;

	/* Attempt to reserve the preferred flow index */
	if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
	    !test_and_set_bit(last, &rcd->flow_mask))
		return last;

	nr = ffz(rcd->flow_mask);
	BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
		     (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
	if (nr > (RXE_NUM_TID_FLOWS - 1))
		return -EAGAIN;
	set_bit(nr, &rcd->flow_mask);
	return nr;
}
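
/*
 * Program one entry of the per-context RcvTidFlowTable CSR: the flow is
 * marked valid with the given generation and is kept on sequence or
 * generation errors. KDETH header suppression is enabled only for real
 * generations, not for KERN_GENERATION_RESERVED.
 */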
static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
			     u32 flow_idx)
{
	u64 reg;

	reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
		RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
		RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
		RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
		RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
		RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;

	if (generation != KERN_GENERATION_RESERVED)
		reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;

	write_uctxt_csr(rcd->dd, rcd->ctxt,
			RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
}

static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
	__must_hold(&rcd->exp_lock)
{
	u32 generation = rcd->flows[flow_idx].generation;

	kern_set_hw_flow(rcd, generation, flow_idx);
	return generation;
}

static u32 kern_flow_generation_next(u32 gen)
{
	u32 generation = mask_generation(gen + 1);

	if (generation == KERN_GENERATION_RESERVED)
		generation = mask_generation(generation + 1);
	return generation;
}

static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
	__must_hold(&rcd->exp_lock)
{
	rcd->flows[flow_idx].generation =
		kern_flow_generation_next(rcd->flows[flow_idx].generation);
	kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
}

int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
{
	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
	struct tid_flow_state *fs = &qpriv->flow_state;
	struct rvt_qp *fqp;
	unsigned long flags;
	int ret = 0;

	/* The QP already has an allocated flow */
	if (fs->index != RXE_NUM_TID_FLOWS)
		return ret;

	spin_lock_irqsave(&rcd->exp_lock, flags);
	if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
		goto queue;

	ret = kern_reserve_flow(rcd, fs->last_index);
	if (ret < 0)
		goto queue;
	fs->index = ret;
	fs->last_index = fs->index;

	/* Generation received in a RESYNC overrides default flow generation */
	if (fs->generation != KERN_GENERATION_RESERVED)
		rcd->flows[fs->index].generation = fs->generation;
	fs->generation = kern_setup_hw_flow(rcd, fs->index);
	fs->psn = 0;
	dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
	/* get head before dropping lock */
	fqp = first_qp(rcd, &rcd->flow_queue);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);

	tid_rdma_schedule_tid_wakeup(fqp);
	return 0;
queue:
	queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);
	return -EAGAIN;
}
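
/*
 * Release the hardware flow held by this QP, if any: advance the flow
 * generation, reset the RcvTidFlowTable entry to the reserved generation,
 * return the flow index to the context's pool, and wake up the next QP
 * waiting for a flow.
 */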
void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
{
	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
	struct tid_flow_state *fs = &qpriv->flow_state;
	struct rvt_qp *fqp;
	unsigned long flags;

	if (fs->index >= RXE_NUM_TID_FLOWS)
		return;
	spin_lock_irqsave(&rcd->exp_lock, flags);
	kern_clear_hw_flow(rcd, fs->index);
	clear_bit(fs->index, &rcd->flow_mask);
	fs->index = RXE_NUM_TID_FLOWS;
	fs->psn = 0;
	fs->generation = KERN_GENERATION_RESERVED;

	/* get head before dropping lock */
	fqp = first_qp(rcd, &rcd->flow_queue);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);

	if (fqp == qp) {
		__trigger_tid_waiter(fqp);
		rvt_put_qp(fqp);
	} else {
		tid_rdma_schedule_tid_wakeup(fqp);
	}
}

void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
{
	int i;

	for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
		rcd->flows[i].generation = mask_generation(prandom_u32());
		kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
	}
}

/* TID allocation functions */
static u8 trdma_pset_order(struct tid_rdma_pageset *s)
{
	u8 count = s->count;

	return ilog2(count) + 1;
}

/**
 * tid_rdma_find_phys_blocks_4k - get groups based on mr info
 * @flow: overall info for a TID RDMA segment
 * @pages: pointer to an array of page structs
 * @npages: number of pages
 * @list: page set array to return
 *
 * This routine returns the number of groups associated with
 * the current sge information. This implementation is based
 * on the expected receive find_phys_blocks() adjusted to
 * use the MR information vs. the pfn.
 *
 * Return:
 * the number of RcvArray entries
 */
static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
					struct page **pages,
					u32 npages,
					struct tid_rdma_pageset *list)
{
	u32 pagecount, pageidx, setcount = 0, i;
	void *vaddr, *this_vaddr;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	vaddr = page_address(pages[0]);
	trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_vaddr = i < npages ? page_address(pages[i]) : NULL;
		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
					 this_vaddr);
		/*
		 * If the vaddr's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_vaddr != (vaddr + PAGE_SIZE)) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down in
			 * sizes supported by the HW.
			 * There are two main constraints:
			 * 1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *    If the total set size is bigger than that
			 *    program only a MAX_EXPECTED_BUFFER chunk.
			 * 2. The buffer size has to be a power of two. If
			 *    it is not, round down to the closest power of
			 *    2 and program that size.
			 */
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				trace_hfi1_tid_pageset(flow->req->qp, setcount,
						       list[setcount].idx,
						       list[setcount].count);
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			vaddr = this_vaddr;
		} else {
			vaddr += PAGE_SIZE;
			pagecount++;
		}
	}
	/* ensure we always return an even number of sets */
	if (setcount & 1)
		list[setcount++].count = 0;
	return setcount;
}

/**
 * tid_flush_pages - dump out pages into pagesets
 * @list: list of pagesets
 * @idx: pointer to current page index
 * @pages: number of pages to dump
 * @sets: current number of pagesets
 *
 * This routine flushes out accumulated pages.
 *
 * To ensure an even number of sets the
 * code may add a filler.
 *
 * This can happen when pages is not
 * a power of 2 or pages is a power of 2
 * less than the maximum pages.
 *
 * Return:
 * The new number of sets
 */
static u32 tid_flush_pages(struct tid_rdma_pageset *list,
			   u32 *idx, u32 pages, u32 sets)
{
	while (pages) {
		u32 maxpages = pages;

		if (maxpages > MAX_EXPECTED_PAGES)
			maxpages = MAX_EXPECTED_PAGES;
		else if (!is_power_of_2(maxpages))
			maxpages = rounddown_pow_of_two(maxpages);
		list[sets].idx = *idx;
		list[sets++].count = maxpages;
		*idx += maxpages;
		pages -= maxpages;
	}
	/* might need a filler */
	if (sets & 1)
		list[sets++].count = 0;
	return sets;
}

/**
 * tid_rdma_find_phys_blocks_8k - get groups based on mr info
 * @flow: overall info for a TID RDMA segment
 * @pages: pointer to an array of page structs
 * @npages: number of pages
 * @list: page set array to return
 *
 * This routine parses an array of pages to compute pagesets
 * in an 8k compatible way.
 *
 * pages are tested two at a time, i, i + 1 for contiguous
 * pages and i - 1 and i contiguous pages.
 *
 * If any condition is false, any accumulated pages are flushed and
 * v0,v1 are emitted as separate PAGE_SIZE pagesets
 *
 * Otherwise, the current 8k is totaled for a future flush.
 *
 * Return:
 * The number of pagesets
 * list set with the returned number of pagesets
 *
 */
static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
					struct page **pages,
					u32 npages,
					struct tid_rdma_pageset *list)
{
	u32 idx, sets = 0, i;
	u32 pagecnt = 0;
	void *v0, *v1, *vm1;

	if (!npages)
		return 0;
	for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
		/* get a new v0 */
		v0 = page_address(pages[i]);
		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
		v1 = i + 1 < npages ?
				page_address(pages[i + 1]) : NULL;
		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
		/* compare i, i + 1 vaddr */
		if (v1 != (v0 + PAGE_SIZE)) {
			/* flush out pages */
			sets = tid_flush_pages(list, &idx, pagecnt, sets);
			/* output v0,v1 as two pagesets */
			list[sets].idx = idx++;
			list[sets++].count = 1;
			if (v1) {
				list[sets].count = 1;
				list[sets++].idx = idx++;
			} else {
				list[sets++].count = 0;
			}
			vm1 = NULL;
			pagecnt = 0;
			continue;
		}
		/* i,i+1 consecutive, look at i-1,i */
		if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
			/* flush out pages */
			sets = tid_flush_pages(list, &idx, pagecnt, sets);
			pagecnt = 0;
		}
		/* pages will always be a multiple of 8k */
		pagecnt += 2;
		/* save i-1 */
		vm1 = v1;
		/* move to next pair */
	}
	/* dump residual pages at end */
	sets = tid_flush_pages(list, &idx, npages - idx, sets);
	/* by design cannot be odd sets */
	WARN_ON(sets & 1);
	return sets;
}

/*
 * Find pages for one segment of a sge array represented by @ss. The function
 * does not check the sge, the sge must have been checked for alignment with a
 * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
 * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
 * copy maintained in @ss->sge, the original sge is not modified.
 *
 * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
 * releasing the MR reference count at the same time. Otherwise, we'll "leak"
 * references to the MR. This difference requires that we keep track of progress
 * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
 * structure.
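 * The req->isge index below records which sge of @ss is currently being
 * consumed.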
 */
static u32 kern_find_pages(struct tid_rdma_flow *flow,
			   struct page **pages,
			   struct rvt_sge_state *ss, bool *last)
{
	struct tid_rdma_request *req = flow->req;
	struct rvt_sge *sge = &ss->sge;
	u32 length = flow->req->seg_len;
	u32 len = PAGE_SIZE;
	u32 i = 0;

	while (length && req->isge < ss->num_sge) {
		pages[i++] = virt_to_page(sge->vaddr);

		sge->vaddr += len;
		sge->length -= len;
		sge->sge_length -= len;
		if (!sge->sge_length) {
			if (++req->isge < ss->num_sge)
				*sge = ss->sg_list[req->isge - 1];
		} else if (sge->length == 0 && sge->mr->lkey) {
			if (++sge->n >= RVT_SEGSZ) {
				++sge->m;
				sge->n = 0;
			}
			sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
			sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
		}
		length -= len;
	}

	flow->length = flow->req->seg_len - length;
	*last = req->isge != ss->num_sge;
	return i;
}

static void dma_unmap_flow(struct tid_rdma_flow *flow)
{
	struct hfi1_devdata *dd;
	int i;
	struct tid_rdma_pageset *pset;

	dd = flow->req->rcd->dd;
	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
			i++, pset++) {
		if (pset->count && pset->addr) {
			dma_unmap_page(&dd->pcidev->dev,
				       pset->addr,
				       PAGE_SIZE * pset->count,
				       DMA_FROM_DEVICE);
			pset->mapped = 0;
		}
	}
}

static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
{
	int i;
	struct hfi1_devdata *dd = flow->req->rcd->dd;
	struct tid_rdma_pageset *pset;

	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
			i++, pset++) {
		if (pset->count) {
			pset->addr = dma_map_page(&dd->pcidev->dev,
						  pages[pset->idx],
						  0,
						  PAGE_SIZE * pset->count,
						  DMA_FROM_DEVICE);

			if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
				dma_unmap_flow(flow);
				return -ENOMEM;
			}
			pset->mapped = 1;
		}
	}
	return 0;
}

static inline bool dma_mapped(struct tid_rdma_flow *flow)
{
	return !!flow->pagesets[0].mapped;
}

/*
 * Get pages pointers and identify contiguous physical memory chunks for a
 * segment. All segments are of length flow->req->seg_len.
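 * If the flow already has pagesets computed (e.g. on a retry), they are
 * reused and only DMA-mapped again when not currently mapped.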
 */
static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
				struct page **pages,
				struct rvt_sge_state *ss, bool *last)
{
	u8 npages;

	/* Reuse previously computed pagesets, if any */
	if (flow->npagesets) {
		trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
					  flow);
		if (!dma_mapped(flow))
			return dma_map_flow(flow, pages);
		return 0;
	}

	npages = kern_find_pages(flow, pages, ss, last);

	if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
		flow->npagesets =
			tid_rdma_find_phys_blocks_4k(flow, pages, npages,
						     flow->pagesets);
	else
		flow->npagesets =
			tid_rdma_find_phys_blocks_8k(flow, pages, npages,
						     flow->pagesets);

	return dma_map_flow(flow, pages);
}

static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
				     struct hfi1_ctxtdata *rcd, char *s,
				     struct tid_group *grp, u8 cnt)
{
	struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];

	WARN_ON_ONCE(flow->tnode_cnt >=
		     (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
	if (WARN_ON_ONCE(cnt & 1))
		dd_dev_err(rcd->dd,
			   "unexpected odd allocation cnt %u map 0x%x used %u",
			   cnt, grp->map, grp->used);

	node->grp = grp;
	node->map = grp->map;
	node->cnt = cnt;
	trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
				grp->base, grp->map, grp->used, cnt);
}

/*
 * Try to allocate pageset_count TID's from TID groups for a context
 *
 * This function allocates TID's without moving groups between lists or
 * modifying grp->map. This is done as follows, being cognizant of the lists
 * between which the TID groups will move:
 * 1. First allocate complete groups of 8 TID's since this is more efficient,
 *    these groups will move from group->full without affecting used
 * 2. If more TID's are needed allocate from used (will move from used->full or
 *    stay in used)
 * 3. If we still don't have the required number of TID's go back and look again
 *    at a complete group (will move from group->used)
 */
static int kern_alloc_tids(struct tid_rdma_flow *flow)
{
	struct hfi1_ctxtdata *rcd = flow->req->rcd;
	struct hfi1_devdata *dd = rcd->dd;
	u32 ngroups, pageidx = 0;
	struct tid_group *group = NULL, *used;
	u8 use;

	flow->tnode_cnt = 0;
	ngroups = flow->npagesets / dd->rcv_entries.group_size;
	if (!ngroups)
		goto used_list;

	/* First look at complete groups */
	list_for_each_entry(group, &rcd->tid_group_list.list, list) {
		kern_add_tid_node(flow, rcd, "complete groups", group,
				  group->size);

		pageidx += group->size;
		if (!--ngroups)
			break;
	}

	if (pageidx >= flow->npagesets)
		goto ok;

used_list:
	/* Now look at partially used groups */
	list_for_each_entry(used, &rcd->tid_used_list.list, list) {
		use = min_t(u32, flow->npagesets - pageidx,
			    used->size - used->used);
		kern_add_tid_node(flow, rcd, "used groups", used, use);

		pageidx += use;
		if (pageidx >= flow->npagesets)
			goto ok;
	}

	/*
	 * Look again at a complete group, continuing from where we left.
	 * However, if we are at the head, we have reached the end of the
	 * complete groups list from the first loop above
	 */
	if (group && &group->list == &rcd->tid_group_list.list)
		goto bail_eagain;
	group = list_prepare_entry(group, &rcd->tid_group_list.list,
				   list);
	if (list_is_last(&group->list, &rcd->tid_group_list.list))
		goto bail_eagain;
	group = list_next_entry(group, list);
	use = min_t(u32, flow->npagesets - pageidx, group->size);
	kern_add_tid_node(flow, rcd, "complete continue", group, use);
	pageidx += use;
	if (pageidx >= flow->npagesets)
		goto ok;
bail_eagain:
	trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
				  (u64)flow->npagesets);
	return -EAGAIN;
ok:
	return 0;
}
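
/*
 * Program the RcvArray entries of one allocated TID group for this flow:
 * entries that are already in use (or beyond node->cnt) only get a WC
 * fill, every other entry is written with the DMA address and order of
 * the next pageset. Adjacent even/odd entries are paired under a single
 * TID entry (tidctrl 0x3) when possible, and each completed TID entry is
 * appended to flow->tid_entry while the group bookkeeping (used/map and
 * list membership) is updated.
 */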
static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
				   u32 *pset_idx)
{
	struct hfi1_ctxtdata *rcd = flow->req->rcd;
	struct hfi1_devdata *dd = rcd->dd;
	struct kern_tid_node *node = &flow->tnode[grp_num];
	struct tid_group *grp = node->grp;
	struct tid_rdma_pageset *pset;
	u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
	u32 rcventry, npages = 0, pair = 0, tidctrl;
	u8 i, cnt = 0;

	for (i = 0; i < grp->size; i++) {
		rcventry = grp->base + i;

		if (node->map & BIT(i) || cnt >= node->cnt) {
			rcv_array_wc_fill(dd, rcventry);
			continue;
		}
		pset = &flow->pagesets[(*pset_idx)++];
		if (pset->count) {
			hfi1_put_tid(dd, rcventry, PT_EXPECTED,
				     pset->addr, trdma_pset_order(pset));
		} else {
			hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
		}
		npages += pset->count;

		rcventry -= rcd->expected_base;
		tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
		/*
		 * A single TID entry will be used to use a rcvarr pair (with
		 * tidctrl 0x3), if ALL these are true (a) the bit pos is even
		 * (b) the group map shows current and the next bits as free
		 * indicating two consecutive rcvarry entries are available (c)
		 * we actually need 2 more entries
		 */
		pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
		       node->cnt >= cnt + 2;
		if (!pair) {
			if (!pset->count)
				tidctrl = 0x1;
			flow->tid_entry[flow->tidcnt++] =
				EXP_TID_SET(IDX, rcventry >> 1) |
				EXP_TID_SET(CTRL, tidctrl) |
				EXP_TID_SET(LEN, npages);
			trace_hfi1_tid_entry_alloc(/* entry */
			   flow->req->qp, flow->tidcnt - 1,
			   flow->tid_entry[flow->tidcnt - 1]);

			/* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
			flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
			npages = 0;
		}

		if (grp->used == grp->size - 1)
			tid_group_move(grp, &rcd->tid_used_list,
				       &rcd->tid_full_list);
		else if (!grp->used)
			tid_group_move(grp, &rcd->tid_group_list,
				       &rcd->tid_used_list);

		grp->used++;
		grp->map |= BIT(i);
		cnt++;
	}
}

static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
{
	struct hfi1_ctxtdata *rcd = flow->req->rcd;
	struct hfi1_devdata *dd = rcd->dd;
	struct kern_tid_node *node = &flow->tnode[grp_num];
	struct tid_group *grp = node->grp;
	u32 rcventry;
	u8 i, cnt = 0;

	for (i = 0; i < grp->size; i++) {
		rcventry = grp->base + i;

		if (node->map & BIT(i) || cnt >= node->cnt) {
			rcv_array_wc_fill(dd, rcventry);
			continue;
		}

		hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);

		grp->used--;
		grp->map &= ~BIT(i);
		cnt++;

		if (grp->used == grp->size - 1)
			tid_group_move(grp, &rcd->tid_full_list,
				       &rcd->tid_used_list);
		else if (!grp->used)
			tid_group_move(grp, &rcd->tid_used_list,
				       &rcd->tid_group_list);
	}
	if (WARN_ON_ONCE(cnt & 1)) {
		struct hfi1_ctxtdata *rcd = flow->req->rcd;
		struct hfi1_devdata *dd = rcd->dd;

		dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
			   cnt, grp->map, grp->used);
	}
}

static void kern_program_rcvarray(struct tid_rdma_flow *flow)
{
	u32 pset_idx = 0;
	int i;

	flow->npkts = 0;
	flow->tidcnt = 0;
	for (i = 0; i < flow->tnode_cnt; i++)
		kern_program_rcv_group(flow, i, &pset_idx);
	trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
}

/**
 * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
 *			       TID RDMA request
 * @req: TID RDMA request for which the segment/flow is being set up
 * @ss: sge state, maintains state across successive segments of a sge
 * @last: set to true after the last sge segment has been processed
 *
 * This function
 * (1) finds a free flow entry in the flow circular buffer
 * (2) finds pages and continuous physical chunks constituting one segment
 *     of an sge
 * (3) allocates TID group entries for those chunks
 * (4) programs rcvarray entries in the hardware corresponding to those
 *     TID's
 * (5) computes a tidarray with formatted TID entries which can be sent
 *     to the sender
 * (6) Reserves and programs HW flows.
 * (7) It also manages queueing the QP when TID/flow resources are not
 *     available.
 *
 * @req points to struct tid_rdma_request of which the segments are a part. The
 * function uses qp, rcd and seg_len members of @req. In the absence of errors,
 * req->flow_idx is the index of the flow which has been prepared in this
 * invocation of function call. With flow = &req->flows[req->flow_idx],
 * flow->tid_entry contains the TID array which the sender can use for TID RDMA
 * sends and flow->npkts contains number of packets required to send the
 * segment.
 *
 * hfi1_check_sge_align should be called prior to calling this function and if
 * it signals error TID RDMA cannot be used for this sge and this function
 * should not be called.
 *
 * For the queuing, caller must hold the flow->req->qp s_lock from the send
 * engine and the function will procure the exp_lock.
 *
 * Return:
 * The function returns -EAGAIN if sufficient number of TID/flow resources to
 * map the segment could not be allocated. In this case the function should be
 * called again with previous arguments to retry the TID allocation. There are
 * no other error returns. The function returns 0 on success.
 */
int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
			    struct rvt_sge_state *ss, bool *last)
	__must_hold(&req->qp->s_lock)
{
	struct tid_rdma_flow *flow = &req->flows[req->setup_head];
	struct hfi1_ctxtdata *rcd = req->rcd;
	struct hfi1_qp_priv *qpriv = req->qp->priv;
	unsigned long flags;
	struct rvt_qp *fqp;
	u16 clear_tail = req->clear_tail;

	lockdep_assert_held(&req->qp->s_lock);
	/*
	 * We return error if either (a) we don't have space in the flow
	 * circular buffer, or (b) we already have max entries in the buffer.
	 * Max entries depend on the type of request we are processing and the
	 * negotiated TID RDMA parameters.
	 */
	if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
	    CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
	    req->n_flows)
		return -EINVAL;

	/*
	 * Get pages, identify contiguous physical memory chunks for the
	 * segment. If we can not determine a DMA address mapping we will
	 * treat it just like if we ran out of space above.
	 */
	if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
		hfi1_wait_kmem(flow->req->qp);
		return -ENOMEM;
	}

	spin_lock_irqsave(&rcd->exp_lock, flags);
	if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
		goto queue;

	/*
	 * At this point we know the number of pagesets and hence the number of
	 * TID's to map the segment. Allocate the TID's from the TID groups. If
	 * we cannot allocate the required number we exit and try again later
	 */
	if (kern_alloc_tids(flow))
		goto queue;
	/*
	 * Finally program the TID entries with the pagesets, compute the
	 * tidarray and enable the HW flow
	 */
	kern_program_rcvarray(flow);

	/*
	 * Setup the flow state with relevant information.
	 * This information is used for tracking the sequence of data packets
	 * for the segment.
	 * The flow is setup here as this is the most accurate time and place
	 * to do so. Doing it at a later time runs the risk of the flow data in
	 * qpriv getting out of sync.
	 */
	memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
	flow->idx = qpriv->flow_state.index;
	flow->flow_state.generation = qpriv->flow_state.generation;
	flow->flow_state.spsn = qpriv->flow_state.psn;
	flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
	flow->flow_state.r_next_psn =
		full_flow_psn(flow, flow->flow_state.spsn);
	qpriv->flow_state.psn += flow->npkts;

	dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
	/* get head before dropping lock */
	fqp = first_qp(rcd, &rcd->rarr_queue);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);
	tid_rdma_schedule_tid_wakeup(fqp);

	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
	return 0;
queue:
	queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);
	return -EAGAIN;
}

static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
{
	flow->npagesets = 0;
}

/*
 * This function is called after one segment has been successfully sent to
 * release the flow and TID HW/SW resources for that segment. The segments for
 * a TID RDMA request are setup and cleared in FIFO order which is managed
 * using a circular buffer.
 */
int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
	__must_hold(&req->qp->s_lock)
{
	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
	struct hfi1_ctxtdata *rcd = req->rcd;
	unsigned long flags;
	int i;
	struct rvt_qp *fqp;

	lockdep_assert_held(&req->qp->s_lock);
	/* Exit if we have nothing in the flow circular buffer */
	if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
		return -EINVAL;

	spin_lock_irqsave(&rcd->exp_lock, flags);

	for (i = 0; i < flow->tnode_cnt; i++)
		kern_unprogram_rcv_group(flow, i);
	/* To prevent double unprogramming */
	flow->tnode_cnt = 0;
	/* get head before dropping lock */
	fqp = first_qp(rcd, &rcd->rarr_queue);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);

	dma_unmap_flow(flow);

	hfi1_tid_rdma_reset_flow(flow);
	req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);

	if (fqp == req->qp) {
		__trigger_tid_waiter(fqp);
		rvt_put_qp(fqp);
	} else {
		tid_rdma_schedule_tid_wakeup(fqp);
	}

	return 0;
}

/*
 * This function is called to release all the tid entries for
 * a request.
 */
void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
	__must_hold(&req->qp->s_lock)
{
	/* Use memory barrier for proper ordering */
	while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
		if (hfi1_kern_exp_rcv_clear(req))
			break;
	}
}

/**
 * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
 * @req: the tid rdma request to be cleaned
 */
static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
{
	kfree(req->flows);
	req->flows = NULL;
}

/**
 * __trdma_clean_swqe - clean up for large sized QPs
 * @qp: the queue pair
 * @wqe: the send wqe
 */
void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
{
	struct hfi1_swqe_priv *p = wqe->priv;

	hfi1_kern_exp_rcv_free_flows(&p->tid_req);
}

/*
 * This can be called at QP create time or in the data path.
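 * The gfp flags passed in must therefore match the calling context.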
 */
static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
					 gfp_t gfp)
{
	struct tid_rdma_flow *flows;
	int i;

	if (likely(req->flows))
		return 0;
	flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
			     req->rcd->numa_id);
	if (!flows)
		return -ENOMEM;
	/* mini init */
	for (i = 0; i < MAX_FLOWS; i++) {
		flows[i].req = req;
		flows[i].npagesets = 0;
		flows[i].pagesets[0].mapped = 0;
		flows[i].resync_npkts = 0;
	}
	req->flows = flows;
	return 0;
}

static void hfi1_init_trdma_req(struct rvt_qp *qp,
				struct tid_rdma_request *req)
{
	struct hfi1_qp_priv *qpriv = qp->priv;

	/*
	 * Initialize various TID RDMA request variables.
	 * These variables are "static", which is why they
	 * can be pre-initialized here before the WRs have
	 * even been submitted.
	 * However, non-NULL values for these variables do not
	 * imply that this WQE has been enabled for TID RDMA.
	 * Drivers should check the WQE's opcode to determine
	 * if a request is a TID RDMA one or not.
	 */
	req->qp = qp;
	req->rcd = qpriv->rcd;
}

u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
			    void *context, int vl, int mode, u64 data)
{
	struct hfi1_devdata *dd = context;

	return dd->verbs_dev.n_tidwait;
}

static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
					  u32 psn, u16 *fidx)
{
	u16 head, tail;
	struct tid_rdma_flow *flow;

	head = req->setup_head;
	tail = req->clear_tail;
	for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
			tail = CIRC_NEXT(tail, MAX_FLOWS)) {
		flow = &req->flows[tail];
		if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
		    cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
			if (fidx)
				*fidx = tail;
			return flow;
		}
	}
	return NULL;
}

/* TID RDMA READ functions */
u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
				    struct ib_other_headers *ohdr, u32 *bth1,
				    u32 *bth2, u32 *len)
{
	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
	struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
	struct rvt_qp *qp = req->qp;
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct hfi1_swqe_priv *wpriv = wqe->priv;
	struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
	struct tid_rdma_params *remote;
	u32 req_len = 0;
	void *req_addr = NULL;

	/* This is the IB psn used to send the request */
	*bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
	trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);

	/* TID Entries for TID RDMA READ payload */
	req_addr = &flow->tid_entry[flow->tid_idx];
	req_len = sizeof(*flow->tid_entry) *
			(flow->tidcnt - flow->tid_idx);

	memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
	wpriv->ss.sge.vaddr = req_addr;
	wpriv->ss.sge.sge_length = req_len;
	wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
	/*
	 * We can safely zero these out. Since the first SGE covers the
	 * entire packet, nothing else should even look at the MR.
	 */
	wpriv->ss.sge.mr = NULL;
	wpriv->ss.sge.m = 0;
	wpriv->ss.sge.n = 0;

	wpriv->ss.sg_list = NULL;
	wpriv->ss.total_len = wpriv->ss.sge.sge_length;
	wpriv->ss.num_sge = 1;

	/* Construct the TID RDMA READ REQ packet header */
	rcu_read_lock();
	remote = rcu_dereference(qpriv->tid_rdma.remote);

	KDETH_RESET(rreq->kdeth0, KVER, 0x1);
	KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
	rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
			   req->cur_seg * req->seg_len + flow->sent);
	rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
	rreq->reth.length = cpu_to_be32(*len);
	rreq->tid_flow_psn =
		cpu_to_be32((flow->flow_state.generation <<
			     HFI1_KDETH_BTH_SEQ_SHIFT) |
			    ((flow->flow_state.spsn + flow->pkt) &
			     HFI1_KDETH_BTH_SEQ_MASK));
	rreq->tid_flow_qp =
		cpu_to_be32(qpriv->tid_rdma.local.qp |
			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
			     TID_RDMA_DESTQP_FLOW_SHIFT) |
			    qpriv->rcd->ctxt);
	rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
	*bth1 &= ~RVT_QPN_MASK;
	*bth1 |= remote->qp;
	*bth2 |= IB_BTH_REQ_ACK;
	rcu_read_unlock();

	/* We are done with this segment */
	flow->sent += *len;
	req->cur_seg++;
	qp->s_state = TID_OP(READ_REQ);
	req->ack_pending++;
	req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
	qpriv->pending_tid_r_segs++;
	qp->s_num_rd_atomic++;

	/* Set the TID RDMA READ request payload size */
	*len = req_len;

	return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
}

/*
 * @len: contains the data length to read upon entry and the read request
 *       payload length upon exit.
 */
u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
				 struct ib_other_headers *ohdr, u32 *bth1,
				 u32 *bth2, u32 *len)
	__must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
	struct tid_rdma_flow *flow = NULL;
	u32 hdwords = 0;
	bool last;
	bool retry = true;
	u32 npkts = rvt_div_round_up_mtu(qp, *len);

	trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
					  wqe->lpsn, req);
	/*
	 * Check sync conditions. Make sure that there are no pending
	 * segments before freeing the flow.
	 */
sync_check:
	if (req->state == TID_REQUEST_SYNC) {
		if (qpriv->pending_tid_r_segs)
			goto done;

		hfi1_kern_clear_hw_flow(req->rcd, qp);
		qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
		req->state = TID_REQUEST_ACTIVE;
	}

	/*
	 * If the request for this segment is resent, the tid resources should
	 * have been allocated before. In this case, req->flow_idx should
	 * fall behind req->setup_head.
	 */
	if (req->flow_idx == req->setup_head) {
		retry = false;
		if (req->state == TID_REQUEST_RESEND) {
			/*
			 * This is the first new segment for a request whose
			 * earlier segments have been re-sent. We need to
			 * set up the sge pointer correctly.
			 */
			restart_sge(&qp->s_sge, wqe, req->s_next_psn,
				    qp->pmtu);
			req->isge = 0;
			req->state = TID_REQUEST_ACTIVE;
		}

		/*
		 * Check sync. The last PSN of each generation is reserved for
		 * RESYNC.
1838 */ 1839 if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) { 1840 req->state = TID_REQUEST_SYNC; 1841 goto sync_check; 1842 } 1843 1844 /* Allocate the flow if not yet */ 1845 if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp)) 1846 goto done; 1847 1848 /* 1849 * The following call will advance req->setup_head after 1850 * allocating the tid entries. 1851 */ 1852 if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) { 1853 req->state = TID_REQUEST_QUEUED; 1854 1855 /* 1856 * We don't have resources for this segment. The QP has 1857 * already been queued. 1858 */ 1859 goto done; 1860 } 1861 } 1862 1863 /* req->flow_idx should only be one slot behind req->setup_head */ 1864 flow = &req->flows[req->flow_idx]; 1865 flow->pkt = 0; 1866 flow->tid_idx = 0; 1867 flow->sent = 0; 1868 if (!retry) { 1869 /* Set the first and last IB PSN for the flow in use.*/ 1870 flow->flow_state.ib_spsn = req->s_next_psn; 1871 flow->flow_state.ib_lpsn = 1872 flow->flow_state.ib_spsn + flow->npkts - 1; 1873 } 1874 1875 /* Calculate the next segment start psn.*/ 1876 req->s_next_psn += flow->npkts; 1877 1878 /* Build the packet header */ 1879 hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len); 1880done: 1881 return hdwords; 1882} 1883 1884/* 1885 * Validate and accept the TID RDMA READ request parameters. 1886 * Return 0 if the request is accepted successfully; 1887 * Return 1 otherwise. 1888 */ 1889static int tid_rdma_rcv_read_request(struct rvt_qp *qp, 1890 struct rvt_ack_entry *e, 1891 struct hfi1_packet *packet, 1892 struct ib_other_headers *ohdr, 1893 u32 bth0, u32 psn, u64 vaddr, u32 len) 1894{ 1895 struct hfi1_qp_priv *qpriv = qp->priv; 1896 struct tid_rdma_request *req; 1897 struct tid_rdma_flow *flow; 1898 u32 flow_psn, i, tidlen = 0, pktlen, tlen; 1899 1900 req = ack_to_tid_req(e); 1901 1902 /* Validate the payload first */ 1903 flow = &req->flows[req->setup_head]; 1904 1905 /* payload length = packet length - (header length + ICRC length) */ 1906 pktlen = packet->tlen - (packet->hlen + 4); 1907 if (pktlen > sizeof(flow->tid_entry)) 1908 return 1; 1909 memcpy(flow->tid_entry, packet->ebuf, pktlen); 1910 flow->tidcnt = pktlen / sizeof(*flow->tid_entry); 1911 1912 /* 1913 * Walk the TID_ENTRY list to make sure we have enough space for a 1914 * complete segment. Also calculate the number of required packets. 1915 */ 1916 flow->npkts = rvt_div_round_up_mtu(qp, len); 1917 for (i = 0; i < flow->tidcnt; i++) { 1918 trace_hfi1_tid_entry_rcv_read_req(qp, i, 1919 flow->tid_entry[i]); 1920 tlen = EXP_TID_GET(flow->tid_entry[i], LEN); 1921 if (!tlen) 1922 return 1; 1923 1924 /* 1925 * For tid pair (tidctr == 3), the buffer size of the pair 1926 * should be the sum of the buffer size described by each 1927 * tid entry. However, only the first entry needs to be 1928 * specified in the request (see WFR HAS Section 8.5.7.1). 
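 *
 * The LEN field of each entry is in units of pages, so the list as a
 * whole describes tidlen * PAGE_SIZE bytes. For example, with 4KB
 * pages a 256KB segment needs entries adding up to at least 64 pages;
 * anything smaller fails the tidlen * PAGE_SIZE < len check below and
 * the request is rejected.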
1929 */ 1930 tidlen += tlen; 1931 } 1932 if (tidlen * PAGE_SIZE < len) 1933 return 1; 1934 1935 /* Empty the flow array */ 1936 req->clear_tail = req->setup_head; 1937 flow->pkt = 0; 1938 flow->tid_idx = 0; 1939 flow->tid_offset = 0; 1940 flow->sent = 0; 1941 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp); 1942 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & 1943 TID_RDMA_DESTQP_FLOW_MASK; 1944 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn)); 1945 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 1946 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; 1947 flow->length = len; 1948 1949 flow->flow_state.lpsn = flow->flow_state.spsn + 1950 flow->npkts - 1; 1951 flow->flow_state.ib_spsn = psn; 1952 flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1; 1953 1954 trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow); 1955 /* Set the initial flow index to the current flow. */ 1956 req->flow_idx = req->setup_head; 1957 1958 /* advance circular buffer head */ 1959 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); 1960 1961 /* 1962 * Compute last PSN for request. 1963 */ 1964 e->opcode = (bth0 >> 24) & 0xff; 1965 e->psn = psn; 1966 e->lpsn = psn + flow->npkts - 1; 1967 e->sent = 0; 1968 1969 req->n_flows = qpriv->tid_rdma.local.max_read; 1970 req->state = TID_REQUEST_ACTIVE; 1971 req->cur_seg = 0; 1972 req->comp_seg = 0; 1973 req->ack_seg = 0; 1974 req->isge = 0; 1975 req->seg_len = qpriv->tid_rdma.local.max_len; 1976 req->total_len = len; 1977 req->total_segs = 1; 1978 req->r_flow_psn = e->psn; 1979 1980 trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn, 1981 req); 1982 return 0; 1983} 1984 1985static int tid_rdma_rcv_error(struct hfi1_packet *packet, 1986 struct ib_other_headers *ohdr, 1987 struct rvt_qp *qp, u32 psn, int diff) 1988{ 1989 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 1990 struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; 1991 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 1992 struct hfi1_qp_priv *qpriv = qp->priv; 1993 struct rvt_ack_entry *e; 1994 struct tid_rdma_request *req; 1995 unsigned long flags; 1996 u8 prev; 1997 bool old_req; 1998 1999 trace_hfi1_rsp_tid_rcv_error(qp, psn); 2000 trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff); 2001 if (diff > 0) { 2002 /* sequence error */ 2003 if (!qp->r_nak_state) { 2004 ibp->rvp.n_rc_seqnak++; 2005 qp->r_nak_state = IB_NAK_PSN_ERROR; 2006 qp->r_ack_psn = qp->r_psn; 2007 rc_defered_ack(rcd, qp); 2008 } 2009 goto done; 2010 } 2011 2012 ibp->rvp.n_rc_dupreq++; 2013 2014 spin_lock_irqsave(&qp->s_lock, flags); 2015 e = find_prev_entry(qp, psn, &prev, NULL, &old_req); 2016 if (!e || (e->opcode != TID_OP(READ_REQ) && 2017 e->opcode != TID_OP(WRITE_REQ))) 2018 goto unlock; 2019 2020 req = ack_to_tid_req(e); 2021 req->r_flow_psn = psn; 2022 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req); 2023 if (e->opcode == TID_OP(READ_REQ)) { 2024 struct ib_reth *reth; 2025 u32 len; 2026 u32 rkey; 2027 u64 vaddr; 2028 int ok; 2029 u32 bth0; 2030 2031 reth = &ohdr->u.tid_rdma.r_req.reth; 2032 /* 2033 * The requester always restarts from the start of the original 2034 * request. 
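 *
 * In other words, a duplicate TID RDMA READ REQ is only honoured when
 * it carries the original starting PSN and the original RETH length
 * (and a valid rkey); anything else fails the checks below and the
 * packet is simply ignored.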
2035 */ 2036 len = be32_to_cpu(reth->length); 2037 if (psn != e->psn || len != req->total_len) 2038 goto unlock; 2039 2040 release_rdma_sge_mr(e); 2041 2042 rkey = be32_to_cpu(reth->rkey); 2043 vaddr = get_ib_reth_vaddr(reth); 2044 2045 qp->r_len = len; 2046 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, 2047 IB_ACCESS_REMOTE_READ); 2048 if (unlikely(!ok)) 2049 goto unlock; 2050 2051 /* 2052 * When all the response packets for the current request have 2053 * been sent out and this request is complete (old_request 2054 * == false), the TID flow may be unusable (the 2055 * req->clear_tail is advanced). However, when an earlier 2056 * request is received, this request will not be complete any 2057 * more (qp->s_tail_ack_queue is moved back, see below). 2058 * Consequently, we need to update the TID flow info every time 2059 * a duplicate request is received. 2060 */ 2061 bth0 = be32_to_cpu(ohdr->bth[0]); 2062 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, 2063 vaddr, len)) 2064 goto unlock; 2065 2066 /* 2067 * True if the request is already scheduled (between 2068 * qp->s_tail_ack_queue and qp->r_head_ack_queue); 2069 */ 2070 if (old_req) 2071 goto unlock; 2072 } else { 2073 struct flow_state *fstate; 2074 bool schedule = false; 2075 u8 i; 2076 2077 if (req->state == TID_REQUEST_RESEND) { 2078 req->state = TID_REQUEST_RESEND_ACTIVE; 2079 } else if (req->state == TID_REQUEST_INIT_RESEND) { 2080 req->state = TID_REQUEST_INIT; 2081 schedule = true; 2082 } 2083 2084 /* 2085 * True if the request is already scheduled (between 2086 * qp->s_tail_ack_queue and qp->r_head_ack_queue). 2087 * Also, don't change requests, which are at the SYNC 2088 * point and haven't generated any responses yet. 2089 * There is nothing to retransmit for them yet. 2090 */ 2091 if (old_req || req->state == TID_REQUEST_INIT || 2092 (req->state == TID_REQUEST_SYNC && !req->cur_seg)) { 2093 for (i = prev + 1; ; i++) { 2094 if (i > rvt_size_atomic(&dev->rdi)) 2095 i = 0; 2096 if (i == qp->r_head_ack_queue) 2097 break; 2098 e = &qp->s_ack_queue[i]; 2099 req = ack_to_tid_req(e); 2100 if (e->opcode == TID_OP(WRITE_REQ) && 2101 req->state == TID_REQUEST_INIT) 2102 req->state = TID_REQUEST_INIT_RESEND; 2103 } 2104 /* 2105 * If the state of the request has been changed, 2106 * the first leg needs to get scheduled in order to 2107 * pick up the change. Otherwise, normal response 2108 * processing should take care of it. 2109 */ 2110 if (!schedule) 2111 goto unlock; 2112 } 2113 2114 /* 2115 * If there is no more allocated segment, just schedule the qp 2116 * without changing any state. 2117 */ 2118 if (req->clear_tail == req->setup_head) 2119 goto schedule; 2120 /* 2121 * If this request has sent responses for segments, which have 2122 * not received data yet (flow_idx != clear_tail), the flow_idx 2123 * pointer needs to be adjusted so the same responses can be 2124 * re-sent. 2125 */ 2126 if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) { 2127 fstate = &req->flows[req->clear_tail].flow_state; 2128 qpriv->pending_tid_w_segs -= 2129 CIRC_CNT(req->flow_idx, req->clear_tail, 2130 MAX_FLOWS); 2131 req->flow_idx = 2132 CIRC_ADD(req->clear_tail, 2133 delta_psn(psn, fstate->resp_ib_psn), 2134 MAX_FLOWS); 2135 qpriv->pending_tid_w_segs += 2136 delta_psn(psn, fstate->resp_ib_psn); 2137 /* 2138 * When flow_idx == setup_head, we've gotten a duplicate 2139 * request for a segment, which has not been allocated 2140 * yet. In that case, don't adjust this request. 
2141 * However, we still want to go through the loop below 2142 * to adjust all subsequent requests. 2143 */ 2144 if (CIRC_CNT(req->setup_head, req->flow_idx, 2145 MAX_FLOWS)) { 2146 req->cur_seg = delta_psn(psn, e->psn); 2147 req->state = TID_REQUEST_RESEND_ACTIVE; 2148 } 2149 } 2150 2151 for (i = prev + 1; ; i++) { 2152 /* 2153 * Look at everything up to and including 2154 * s_tail_ack_queue 2155 */ 2156 if (i > rvt_size_atomic(&dev->rdi)) 2157 i = 0; 2158 if (i == qp->r_head_ack_queue) 2159 break; 2160 e = &qp->s_ack_queue[i]; 2161 req = ack_to_tid_req(e); 2162 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, 2163 e->lpsn, req); 2164 if (e->opcode != TID_OP(WRITE_REQ) || 2165 req->cur_seg == req->comp_seg || 2166 req->state == TID_REQUEST_INIT || 2167 req->state == TID_REQUEST_INIT_RESEND) { 2168 if (req->state == TID_REQUEST_INIT) 2169 req->state = TID_REQUEST_INIT_RESEND; 2170 continue; 2171 } 2172 qpriv->pending_tid_w_segs -= 2173 CIRC_CNT(req->flow_idx, 2174 req->clear_tail, 2175 MAX_FLOWS); 2176 req->flow_idx = req->clear_tail; 2177 req->state = TID_REQUEST_RESEND; 2178 req->cur_seg = req->comp_seg; 2179 } 2180 qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; 2181 } 2182 /* Re-process old requests.*/ 2183 if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) 2184 qp->s_acked_ack_queue = prev; 2185 qp->s_tail_ack_queue = prev; 2186 /* 2187 * Since the qp->s_tail_ack_queue is modified, the 2188 * qp->s_ack_state must be changed to re-initialize 2189 * qp->s_ack_rdma_sge; Otherwise, we will end up in the 2190 * wrong memory region. 2191 */ 2192 qp->s_ack_state = OP(ACKNOWLEDGE); 2193schedule: 2194 /* 2195 * It's possible to receive a retry psn that is earlier than an RNRNAK 2196 * psn. In this case, the rnrnak state should be cleared. 2197 */ 2198 if (qpriv->rnr_nak_state) { 2199 qp->s_nak_state = 0; 2200 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 2201 qp->r_psn = e->lpsn + 1; 2202 hfi1_tid_write_alloc_resources(qp, true); 2203 } 2204 2205 qp->r_state = e->opcode; 2206 qp->r_nak_state = 0; 2207 qp->s_flags |= RVT_S_RESP_PENDING; 2208 hfi1_schedule_send(qp); 2209unlock: 2210 spin_unlock_irqrestore(&qp->s_lock, flags); 2211done: 2212 return 1; 2213} 2214 2215void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) 2216{ 2217 /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/ 2218 2219 /* 2220 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ 2221 * (see hfi1_rc_rcv()) 2222 * 2. Put TID RDMA READ REQ into the response queue (s_ack_queue) 2223 * - Setup struct tid_rdma_req with request info 2224 * - Initialize struct tid_rdma_flow info; 2225 * - Copy TID entries; 2226 * 3. Set the qp->s_ack_state. 2227 * 4. Set RVT_S_RESP_PENDING in s_flags. 2228 * 5. 
Kick the send engine (hfi1_schedule_send()) 2229 */ 2230 struct hfi1_ctxtdata *rcd = packet->rcd; 2231 struct rvt_qp *qp = packet->qp; 2232 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 2233 struct ib_other_headers *ohdr = packet->ohdr; 2234 struct rvt_ack_entry *e; 2235 unsigned long flags; 2236 struct ib_reth *reth; 2237 struct hfi1_qp_priv *qpriv = qp->priv; 2238 u32 bth0, psn, len, rkey; 2239 bool fecn; 2240 u8 next; 2241 u64 vaddr; 2242 int diff; 2243 u8 nack_state = IB_NAK_INVALID_REQUEST; 2244 2245 bth0 = be32_to_cpu(ohdr->bth[0]); 2246 if (hfi1_ruc_check_hdr(ibp, packet)) 2247 return; 2248 2249 fecn = process_ecn(qp, packet); 2250 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2251 trace_hfi1_rsp_rcv_tid_read_req(qp, psn); 2252 2253 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 2254 rvt_comm_est(qp); 2255 2256 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) 2257 goto nack_inv; 2258 2259 reth = &ohdr->u.tid_rdma.r_req.reth; 2260 vaddr = be64_to_cpu(reth->vaddr); 2261 len = be32_to_cpu(reth->length); 2262 /* The length needs to be in multiples of PAGE_SIZE */ 2263 if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len) 2264 goto nack_inv; 2265 2266 diff = delta_psn(psn, qp->r_psn); 2267 if (unlikely(diff)) { 2268 tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); 2269 return; 2270 } 2271 2272 /* We've verified the request, insert it into the ack queue. */ 2273 next = qp->r_head_ack_queue + 1; 2274 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 2275 next = 0; 2276 spin_lock_irqsave(&qp->s_lock, flags); 2277 if (unlikely(next == qp->s_tail_ack_queue)) { 2278 if (!qp->s_ack_queue[next].sent) { 2279 nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; 2280 goto nack_inv_unlock; 2281 } 2282 update_ack_queue(qp, next); 2283 } 2284 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 2285 release_rdma_sge_mr(e); 2286 2287 rkey = be32_to_cpu(reth->rkey); 2288 qp->r_len = len; 2289 2290 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, 2291 rkey, IB_ACCESS_REMOTE_READ))) 2292 goto nack_acc; 2293 2294 /* Accept the request parameters */ 2295 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr, 2296 len)) 2297 goto nack_inv_unlock; 2298 2299 qp->r_state = e->opcode; 2300 qp->r_nak_state = 0; 2301 /* 2302 * We need to increment the MSN here instead of when we 2303 * finish sending the result since a duplicate request would 2304 * increment it more than once. 2305 */ 2306 qp->r_msn++; 2307 qp->r_psn += e->lpsn - e->psn + 1; 2308 2309 qp->r_head_ack_queue = next; 2310 2311 /* 2312 * For all requests other than TID WRITE which are added to the ack 2313 * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to 2314 * do this because of interlocks between these and TID WRITE 2315 * requests. The same change has also been made in hfi1_rc_rcv(). 2316 */ 2317 qpriv->r_tid_alloc = qp->r_head_ack_queue; 2318 2319 /* Schedule the send tasklet. 
*/ 2320 qp->s_flags |= RVT_S_RESP_PENDING; 2321 if (fecn) 2322 qp->s_flags |= RVT_S_ECN; 2323 hfi1_schedule_send(qp); 2324 2325 spin_unlock_irqrestore(&qp->s_lock, flags); 2326 return; 2327 2328nack_inv_unlock: 2329 spin_unlock_irqrestore(&qp->s_lock, flags); 2330nack_inv: 2331 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 2332 qp->r_nak_state = nack_state; 2333 qp->r_ack_psn = qp->r_psn; 2334 /* Queue NAK for later */ 2335 rc_defered_ack(rcd, qp); 2336 return; 2337nack_acc: 2338 spin_unlock_irqrestore(&qp->s_lock, flags); 2339 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 2340 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 2341 qp->r_ack_psn = qp->r_psn; 2342} 2343 2344u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, 2345 struct ib_other_headers *ohdr, u32 *bth0, 2346 u32 *bth1, u32 *bth2, u32 *len, bool *last) 2347{ 2348 struct hfi1_ack_priv *epriv = e->priv; 2349 struct tid_rdma_request *req = &epriv->tid_req; 2350 struct hfi1_qp_priv *qpriv = qp->priv; 2351 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 2352 u32 tidentry = flow->tid_entry[flow->tid_idx]; 2353 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; 2354 struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp; 2355 u32 next_offset, om = KDETH_OM_LARGE; 2356 bool last_pkt; 2357 u32 hdwords = 0; 2358 struct tid_rdma_params *remote; 2359 2360 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); 2361 flow->sent += *len; 2362 next_offset = flow->tid_offset + *len; 2363 last_pkt = (flow->sent >= flow->length); 2364 2365 trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry); 2366 trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow); 2367 2368 rcu_read_lock(); 2369 remote = rcu_dereference(qpriv->tid_rdma.remote); 2370 if (!remote) { 2371 rcu_read_unlock(); 2372 goto done; 2373 } 2374 KDETH_RESET(resp->kdeth0, KVER, 0x1); 2375 KDETH_SET(resp->kdeth0, SH, !last_pkt); 2376 KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg)); 2377 KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); 2378 KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); 2379 KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE); 2380 KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om); 2381 KDETH_RESET(resp->kdeth1, JKEY, remote->jkey); 2382 resp->verbs_qp = cpu_to_be32(qp->remote_qpn); 2383 rcu_read_unlock(); 2384 2385 resp->aeth = rvt_compute_aeth(qp); 2386 resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn + 2387 flow->pkt)); 2388 2389 *bth0 = TID_OP(READ_RESP) << 24; 2390 *bth1 = flow->tid_qpn; 2391 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & 2392 HFI1_KDETH_BTH_SEQ_MASK) | 2393 (flow->flow_state.generation << 2394 HFI1_KDETH_BTH_SEQ_SHIFT)); 2395 *last = last_pkt; 2396 if (last_pkt) 2397 /* Advance to next flow */ 2398 req->clear_tail = (req->clear_tail + 1) & 2399 (MAX_FLOWS - 1); 2400 2401 if (next_offset >= tidlen) { 2402 flow->tid_offset = 0; 2403 flow->tid_idx++; 2404 } else { 2405 flow->tid_offset = next_offset; 2406 } 2407 2408 hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32); 2409 2410done: 2411 return hdwords; 2412} 2413 2414static inline struct tid_rdma_request * 2415find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode) 2416 __must_hold(&qp->s_lock) 2417{ 2418 struct rvt_swqe *wqe; 2419 struct tid_rdma_request *req = NULL; 2420 u32 i, end; 2421 2422 end = qp->s_cur + 1; 2423 if (end == qp->s_size) 2424 end = 0; 2425 for (i = qp->s_acked; i != end;) { 2426 wqe = rvt_get_swqe_ptr(qp, i); 2427 if (cmp_psn(psn, wqe->psn) >= 0 && 
2428 cmp_psn(psn, wqe->lpsn) <= 0) { 2429 if (wqe->wr.opcode == opcode) 2430 req = wqe_to_tid_req(wqe); 2431 break; 2432 } 2433 if (++i == qp->s_size) 2434 i = 0; 2435 } 2436 2437 return req; 2438} 2439 2440void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) 2441{ 2442 /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */ 2443 2444 /* 2445 * 1. Find matching SWQE 2446 * 2. Check that the entire segment has been read. 2447 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags. 2448 * 4. Free the TID flow resources. 2449 * 5. Kick the send engine (hfi1_schedule_send()) 2450 */ 2451 struct ib_other_headers *ohdr = packet->ohdr; 2452 struct rvt_qp *qp = packet->qp; 2453 struct hfi1_qp_priv *priv = qp->priv; 2454 struct hfi1_ctxtdata *rcd = packet->rcd; 2455 struct tid_rdma_request *req; 2456 struct tid_rdma_flow *flow; 2457 u32 opcode, aeth; 2458 bool fecn; 2459 unsigned long flags; 2460 u32 kpsn, ipsn; 2461 2462 trace_hfi1_sender_rcv_tid_read_resp(qp); 2463 fecn = process_ecn(qp, packet); 2464 kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2465 aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); 2466 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 2467 2468 spin_lock_irqsave(&qp->s_lock, flags); 2469 ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); 2470 req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ); 2471 if (unlikely(!req)) 2472 goto ack_op_err; 2473 2474 flow = &req->flows[req->clear_tail]; 2475 /* When header suppression is disabled */ 2476 if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) { 2477 update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); 2478 2479 if (cmp_psn(kpsn, flow->flow_state.r_next_psn)) 2480 goto ack_done; 2481 flow->flow_state.r_next_psn = mask_psn(kpsn + 1); 2482 /* 2483 * Copy the payload to destination buffer if this packet is 2484 * delivered as an eager packet due to RSM rule and FECN. 2485 * The RSM rule selects FECN bit in BTH and SH bit in 2486 * KDETH header and therefore will not match the last 2487 * packet of each segment that has SH bit cleared. 
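 *
 * Because the RSM rule never matches the last packet of a segment,
 * every packet that lands here carries a full pMTU of payload, which
 * is what the tlen and restart_sge() length checks below rely on.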
2488 */ 2489 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { 2490 struct rvt_sge_state ss; 2491 u32 len; 2492 u32 tlen = packet->tlen; 2493 u16 hdrsize = packet->hlen; 2494 u8 pad = packet->pad; 2495 u8 extra_bytes = pad + packet->extra_byte + 2496 (SIZE_OF_CRC << 2); 2497 u32 pmtu = qp->pmtu; 2498 2499 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 2500 goto ack_op_err; 2501 len = restart_sge(&ss, req->e.swqe, ipsn, pmtu); 2502 if (unlikely(len < pmtu)) 2503 goto ack_op_err; 2504 rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, 2505 false); 2506 /* Raise the sw sequence check flag for next packet */ 2507 priv->s_flags |= HFI1_R_TID_SW_PSN; 2508 } 2509 2510 goto ack_done; 2511 } 2512 flow->flow_state.r_next_psn = mask_psn(kpsn + 1); 2513 req->ack_pending--; 2514 priv->pending_tid_r_segs--; 2515 qp->s_num_rd_atomic--; 2516 if ((qp->s_flags & RVT_S_WAIT_FENCE) && 2517 !qp->s_num_rd_atomic) { 2518 qp->s_flags &= ~(RVT_S_WAIT_FENCE | 2519 RVT_S_WAIT_ACK); 2520 hfi1_schedule_send(qp); 2521 } 2522 if (qp->s_flags & RVT_S_WAIT_RDMAR) { 2523 qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK); 2524 hfi1_schedule_send(qp); 2525 } 2526 2527 trace_hfi1_ack(qp, ipsn); 2528 trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode, 2529 req->e.swqe->psn, req->e.swqe->lpsn, 2530 req); 2531 trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow); 2532 2533 /* Release the tid resources */ 2534 hfi1_kern_exp_rcv_clear(req); 2535 2536 if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd)) 2537 goto ack_done; 2538 2539 /* If not done yet, build next read request */ 2540 if (++req->comp_seg >= req->total_segs) { 2541 priv->tid_r_comp++; 2542 req->state = TID_REQUEST_COMPLETE; 2543 } 2544 2545 /* 2546 * Clear the hw flow under two conditions: 2547 * 1. This request is a sync point and it is complete; 2548 * 2. Current request is completed and there are no more requests. 2549 */ 2550 if ((req->state == TID_REQUEST_SYNC && 2551 req->comp_seg == req->cur_seg) || 2552 priv->tid_r_comp == priv->tid_r_reqs) { 2553 hfi1_kern_clear_hw_flow(priv->rcd, qp); 2554 priv->s_flags &= ~HFI1_R_TID_SW_PSN; 2555 if (req->state == TID_REQUEST_SYNC) 2556 req->state = TID_REQUEST_ACTIVE; 2557 } 2558 2559 hfi1_schedule_send(qp); 2560 goto ack_done; 2561 2562ack_op_err: 2563 /* 2564 * The test indicates that the send engine has finished its cleanup 2565 * after sending the request and it's now safe to put the QP into error 2566 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail 2567 * == qp->s_head), it would be unsafe to complete the wqe pointed by 2568 * qp->s_acked here. Putting the qp into error state will safely flush 2569 * all remaining requests. 
2570 */ 2571 if (qp->s_last == qp->s_acked) 2572 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 2573 2574ack_done: 2575 spin_unlock_irqrestore(&qp->s_lock, flags); 2576} 2577 2578void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp) 2579 __must_hold(&qp->s_lock) 2580{ 2581 u32 n = qp->s_acked; 2582 struct rvt_swqe *wqe; 2583 struct tid_rdma_request *req; 2584 struct hfi1_qp_priv *priv = qp->priv; 2585 2586 lockdep_assert_held(&qp->s_lock); 2587 /* Free any TID entries */ 2588 while (n != qp->s_tail) { 2589 wqe = rvt_get_swqe_ptr(qp, n); 2590 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2591 req = wqe_to_tid_req(wqe); 2592 hfi1_kern_exp_rcv_clear_all(req); 2593 } 2594 2595 if (++n == qp->s_size) 2596 n = 0; 2597 } 2598 /* Free flow */ 2599 hfi1_kern_clear_hw_flow(priv->rcd, qp); 2600} 2601 2602static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type) 2603{ 2604 struct rvt_qp *qp = packet->qp; 2605 2606 if (rcv_type >= RHF_RCV_TYPE_IB) 2607 goto done; 2608 2609 spin_lock(&qp->s_lock); 2610 2611 /* 2612 * We've run out of space in the eager buffer. 2613 * Eagerly received KDETH packets which require space in the 2614 * Eager buffer (packets that have payload) are TID RDMA WRITE 2615 * response packets. In this case, we have to re-transmit the 2616 * TID RDMA WRITE request. 2617 */ 2618 if (rcv_type == RHF_RCV_TYPE_EAGER) { 2619 hfi1_restart_rc(qp, qp->s_last_psn + 1, 1); 2620 hfi1_schedule_send(qp); 2621 } 2622 2623 /* Since no payload is delivered, just drop the packet */ 2624 spin_unlock(&qp->s_lock); 2625done: 2626 return true; 2627} 2628 2629static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd, 2630 struct rvt_qp *qp, struct rvt_swqe *wqe) 2631{ 2632 struct tid_rdma_request *req; 2633 struct tid_rdma_flow *flow; 2634 2635 /* Start from the right segment */ 2636 qp->r_flags |= RVT_R_RDMAR_SEQ; 2637 req = wqe_to_tid_req(wqe); 2638 flow = &req->flows[req->clear_tail]; 2639 hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0); 2640 if (list_empty(&qp->rspwait)) { 2641 qp->r_flags |= RVT_R_RSP_SEND; 2642 rvt_get_qp(qp); 2643 list_add_tail(&qp->rspwait, &rcd->qp_wait_list); 2644 } 2645} 2646 2647/* 2648 * Handle the KDETH eflags for TID RDMA READ response. 2649 * 2650 * Return false if the last packet for a segment has been received and it is 2651 * time to process the response normally; otherwise, return true. 2652 * 2653 * The caller must hold the packet->qp->r_lock and the rcu_read_lock. 
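 *
 * Only the expected-receive error types need real handling here: the
 * first Flow Sequence error switches the flow to software PSN tracking
 * (HFI1_R_TID_SW_PSN) and, if nothing has been restarted yet, restarts
 * the current TID RDMA READ request; Flow Generation errors are simply
 * dropped as stale packets.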
2654 */ 2655static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, 2656 struct hfi1_packet *packet, u8 rcv_type, 2657 u8 rte, u32 psn, u32 ibpsn) 2658 __must_hold(&packet->qp->r_lock) __must_hold(RCU) 2659{ 2660 struct hfi1_pportdata *ppd = rcd->ppd; 2661 struct hfi1_devdata *dd = ppd->dd; 2662 struct hfi1_ibport *ibp; 2663 struct rvt_swqe *wqe; 2664 struct tid_rdma_request *req; 2665 struct tid_rdma_flow *flow; 2666 u32 ack_psn; 2667 struct rvt_qp *qp = packet->qp; 2668 struct hfi1_qp_priv *priv = qp->priv; 2669 bool ret = true; 2670 int diff = 0; 2671 u32 fpsn; 2672 2673 lockdep_assert_held(&qp->r_lock); 2674 trace_hfi1_rsp_read_kdeth_eflags(qp, ibpsn); 2675 trace_hfi1_sender_read_kdeth_eflags(qp); 2676 trace_hfi1_tid_read_sender_kdeth_eflags(qp, 0); 2677 spin_lock(&qp->s_lock); 2678 /* If the psn is out of valid range, drop the packet */ 2679 if (cmp_psn(ibpsn, qp->s_last_psn) < 0 || 2680 cmp_psn(ibpsn, qp->s_psn) > 0) 2681 goto s_unlock; 2682 2683 /* 2684 * Note that NAKs implicitly ACK outstanding SEND and RDMA write 2685 * requests and implicitly NAK RDMA read and atomic requests issued 2686 * before the NAK'ed request. 2687 */ 2688 ack_psn = ibpsn - 1; 2689 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2690 ibp = to_iport(qp->ibqp.device, qp->port_num); 2691 2692 /* Complete WQEs that the PSN finishes. */ 2693 while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) { 2694 /* 2695 * If this request is a RDMA read or atomic, and the NACK is 2696 * for a later operation, this NACK NAKs the RDMA read or 2697 * atomic. 2698 */ 2699 if (wqe->wr.opcode == IB_WR_RDMA_READ || 2700 wqe->wr.opcode == IB_WR_TID_RDMA_READ || 2701 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2702 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { 2703 /* Retry this request. */ 2704 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { 2705 qp->r_flags |= RVT_R_RDMAR_SEQ; 2706 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2707 restart_tid_rdma_read_req(rcd, qp, 2708 wqe); 2709 } else { 2710 hfi1_restart_rc(qp, qp->s_last_psn + 1, 2711 0); 2712 if (list_empty(&qp->rspwait)) { 2713 qp->r_flags |= RVT_R_RSP_SEND; 2714 rvt_get_qp(qp); 2715 list_add_tail(/* wait */ 2716 &qp->rspwait, 2717 &rcd->qp_wait_list); 2718 } 2719 } 2720 } 2721 /* 2722 * No need to process the NAK since we are 2723 * restarting an earlier request. 2724 */ 2725 break; 2726 } 2727 2728 wqe = do_rc_completion(qp, wqe, ibp); 2729 if (qp->s_acked == qp->s_tail) 2730 goto s_unlock; 2731 } 2732 2733 if (qp->s_acked == qp->s_tail) 2734 goto s_unlock; 2735 2736 /* Handle the eflags for the request */ 2737 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 2738 goto s_unlock; 2739 2740 req = wqe_to_tid_req(wqe); 2741 trace_hfi1_tid_req_read_kdeth_eflags(qp, 0, wqe->wr.opcode, wqe->psn, 2742 wqe->lpsn, req); 2743 switch (rcv_type) { 2744 case RHF_RCV_TYPE_EXPECTED: 2745 switch (rte) { 2746 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: 2747 /* 2748 * On the first occurrence of a Flow Sequence error, 2749 * the flag TID_FLOW_SW_PSN is set. 2750 * 2751 * After that, the flow is *not* reprogrammed and the 2752 * protocol falls back to SW PSN checking. This is done 2753 * to prevent continuous Flow Sequence errors for any 2754 * packets that could be still in the fabric. 
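 *
 * Once HFI1_R_TID_SW_PSN is set, the code below reduces to a three-way
 * compare against flow_state.r_next_psn: packets ahead of it and
 * packets behind it are both dropped, and only an exact match advances
 * r_next_psn (and, for the final packet of the segment, is handed back
 * to the caller for normal processing).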
2755 */ 2756 flow = &req->flows[req->clear_tail]; 2757 trace_hfi1_tid_flow_read_kdeth_eflags(qp, 2758 req->clear_tail, 2759 flow); 2760 if (priv->s_flags & HFI1_R_TID_SW_PSN) { 2761 diff = cmp_psn(psn, 2762 flow->flow_state.r_next_psn); 2763 if (diff > 0) { 2764 /* Drop the packet.*/ 2765 goto s_unlock; 2766 } else if (diff < 0) { 2767 /* 2768 * If a response packet for a restarted 2769 * request has come back, reset the 2770 * restart flag. 2771 */ 2772 if (qp->r_flags & RVT_R_RDMAR_SEQ) 2773 qp->r_flags &= 2774 ~RVT_R_RDMAR_SEQ; 2775 2776 /* Drop the packet.*/ 2777 goto s_unlock; 2778 } 2779 2780 /* 2781 * If SW PSN verification is successful and 2782 * this is the last packet in the segment, tell 2783 * the caller to process it as a normal packet. 2784 */ 2785 fpsn = full_flow_psn(flow, 2786 flow->flow_state.lpsn); 2787 if (cmp_psn(fpsn, psn) == 0) { 2788 ret = false; 2789 if (qp->r_flags & RVT_R_RDMAR_SEQ) 2790 qp->r_flags &= 2791 ~RVT_R_RDMAR_SEQ; 2792 } 2793 flow->flow_state.r_next_psn = 2794 mask_psn(psn + 1); 2795 } else { 2796 u32 last_psn; 2797 2798 last_psn = read_r_next_psn(dd, rcd->ctxt, 2799 flow->idx); 2800 flow->flow_state.r_next_psn = last_psn; 2801 priv->s_flags |= HFI1_R_TID_SW_PSN; 2802 /* 2803 * If no request has been restarted yet, 2804 * restart the current one. 2805 */ 2806 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) 2807 restart_tid_rdma_read_req(rcd, qp, 2808 wqe); 2809 } 2810 2811 break; 2812 2813 case RHF_RTE_EXPECTED_FLOW_GEN_ERR: 2814 /* 2815 * Since the TID flow is able to ride through 2816 * generation mismatch, drop this stale packet. 2817 */ 2818 break; 2819 2820 default: 2821 break; 2822 } 2823 break; 2824 2825 case RHF_RCV_TYPE_ERROR: 2826 switch (rte) { 2827 case RHF_RTE_ERROR_OP_CODE_ERR: 2828 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: 2829 case RHF_RTE_ERROR_KHDR_HCRC_ERR: 2830 case RHF_RTE_ERROR_KHDR_KVER_ERR: 2831 case RHF_RTE_ERROR_CONTEXT_ERR: 2832 case RHF_RTE_ERROR_KHDR_TID_ERR: 2833 default: 2834 break; 2835 } 2836 break; 2837 default: 2838 break; 2839 } 2840s_unlock: 2841 spin_unlock(&qp->s_lock); 2842 return ret; 2843} 2844 2845bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, 2846 struct hfi1_pportdata *ppd, 2847 struct hfi1_packet *packet) 2848{ 2849 struct hfi1_ibport *ibp = &ppd->ibport_data; 2850 struct hfi1_devdata *dd = ppd->dd; 2851 struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; 2852 u8 rcv_type = rhf_rcv_type(packet->rhf); 2853 u8 rte = rhf_rcv_type_err(packet->rhf); 2854 struct ib_header *hdr = packet->hdr; 2855 struct ib_other_headers *ohdr = NULL; 2856 int lnh = be16_to_cpu(hdr->lrh[0]) & 3; 2857 u16 lid = be16_to_cpu(hdr->lrh[1]); 2858 u8 opcode; 2859 u32 qp_num, psn, ibpsn; 2860 struct rvt_qp *qp; 2861 struct hfi1_qp_priv *qpriv; 2862 unsigned long flags; 2863 bool ret = true; 2864 struct rvt_ack_entry *e; 2865 struct tid_rdma_request *req; 2866 struct tid_rdma_flow *flow; 2867 int diff = 0; 2868 2869 trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", 2870 packet->rhf); 2871 if (packet->rhf & RHF_ICRC_ERR) 2872 return ret; 2873 2874 packet->ohdr = &hdr->u.oth; 2875 ohdr = packet->ohdr; 2876 trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); 2877 2878 /* Get the destination QP number. 
*/ 2879 qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) & 2880 RVT_QPN_MASK; 2881 if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) 2882 goto drop; 2883 2884 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2885 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 2886 2887 rcu_read_lock(); 2888 qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); 2889 if (!qp) 2890 goto rcu_unlock; 2891 2892 packet->qp = qp; 2893 2894 /* Check for valid receive state. */ 2895 spin_lock_irqsave(&qp->r_lock, flags); 2896 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { 2897 ibp->rvp.n_pkt_drops++; 2898 goto r_unlock; 2899 } 2900 2901 if (packet->rhf & RHF_TID_ERR) { 2902 /* For TIDERR and RC QPs preemptively schedule a NAK */ 2903 u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ 2904 2905 /* Sanity check packet */ 2906 if (tlen < 24) 2907 goto r_unlock; 2908 2909 /* 2910 * Check for GRH. We should never get packets with GRH in this 2911 * path. 2912 */ 2913 if (lnh == HFI1_LRH_GRH) 2914 goto r_unlock; 2915 2916 if (tid_rdma_tid_err(packet, rcv_type)) 2917 goto r_unlock; 2918 } 2919 2920 /* handle TID RDMA READ */ 2921 if (opcode == TID_OP(READ_RESP)) { 2922 ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn); 2923 ibpsn = mask_psn(ibpsn); 2924 ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn, 2925 ibpsn); 2926 goto r_unlock; 2927 } 2928 2929 /* 2930 * qp->s_tail_ack_queue points to the rvt_ack_entry currently being 2931 * processed. These are completed sequentially so we can be sure that 2932 * the pointer will not change until the entire request has completed. 2933 */ 2934 spin_lock(&qp->s_lock); 2935 qpriv = qp->priv; 2936 if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID || 2937 qpriv->r_tid_tail == qpriv->r_tid_head) 2938 goto unlock; 2939 e = &qp->s_ack_queue[qpriv->r_tid_tail]; 2940 if (e->opcode != TID_OP(WRITE_REQ)) 2941 goto unlock; 2942 req = ack_to_tid_req(e); 2943 if (req->comp_seg == req->cur_seg) 2944 goto unlock; 2945 flow = &req->flows[req->clear_tail]; 2946 trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn); 2947 trace_hfi1_rsp_handle_kdeth_eflags(qp, psn); 2948 trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp); 2949 trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn, 2950 e->lpsn, req); 2951 trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow); 2952 2953 switch (rcv_type) { 2954 case RHF_RCV_TYPE_EXPECTED: 2955 switch (rte) { 2956 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: 2957 if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) { 2958 qpriv->s_flags |= HFI1_R_TID_SW_PSN; 2959 flow->flow_state.r_next_psn = 2960 read_r_next_psn(dd, rcd->ctxt, 2961 flow->idx); 2962 qpriv->r_next_psn_kdeth = 2963 flow->flow_state.r_next_psn; 2964 goto nak_psn; 2965 } else { 2966 /* 2967 * If the received PSN does not match the next 2968 * expected PSN, NAK the packet. 2969 * However, only do that if we know that a 2970 * NAK has already been sent. Otherwise, this 2971 * mismatch could be due to packets that were 2972 * already in flight. 2973 */ 2974 diff = cmp_psn(psn, 2975 flow->flow_state.r_next_psn); 2976 if (diff > 0) 2977 goto nak_psn; 2978 else if (diff < 0) 2979 break; 2980 2981 qpriv->s_nak_state = 0; 2982 /* 2983 * If SW PSN verification is successful and this 2984 * is the last packet in the segment, tell the 2985 * caller to process it as a normal packet. 
2986 */ 2987 if (psn == full_flow_psn(flow, 2988 flow->flow_state.lpsn)) 2989 ret = false; 2990 flow->flow_state.r_next_psn = 2991 mask_psn(psn + 1); 2992 qpriv->r_next_psn_kdeth = 2993 flow->flow_state.r_next_psn; 2994 } 2995 break; 2996 2997 case RHF_RTE_EXPECTED_FLOW_GEN_ERR: 2998 goto nak_psn; 2999 3000 default: 3001 break; 3002 } 3003 break; 3004 3005 case RHF_RCV_TYPE_ERROR: 3006 switch (rte) { 3007 case RHF_RTE_ERROR_OP_CODE_ERR: 3008 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: 3009 case RHF_RTE_ERROR_KHDR_HCRC_ERR: 3010 case RHF_RTE_ERROR_KHDR_KVER_ERR: 3011 case RHF_RTE_ERROR_CONTEXT_ERR: 3012 case RHF_RTE_ERROR_KHDR_TID_ERR: 3013 default: 3014 break; 3015 } 3016 break; 3017 default: 3018 break; 3019 } 3020 3021unlock: 3022 spin_unlock(&qp->s_lock); 3023r_unlock: 3024 spin_unlock_irqrestore(&qp->r_lock, flags); 3025rcu_unlock: 3026 rcu_read_unlock(); 3027drop: 3028 return ret; 3029nak_psn: 3030 ibp->rvp.n_rc_seqnak++; 3031 if (!qpriv->s_nak_state) { 3032 qpriv->s_nak_state = IB_NAK_PSN_ERROR; 3033 /* We are NAK'ing the next expected PSN */ 3034 qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn); 3035 tid_rdma_trigger_ack(qp); 3036 } 3037 goto unlock; 3038} 3039 3040/* 3041 * "Rewind" the TID request information. 3042 * This means that we reset the state back to ACTIVE, 3043 * find the proper flow, set the flow index to that flow, 3044 * and reset the flow information. 3045 */ 3046void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 3047 u32 *bth2) 3048{ 3049 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 3050 struct tid_rdma_flow *flow; 3051 struct hfi1_qp_priv *qpriv = qp->priv; 3052 int diff, delta_pkts; 3053 u32 tididx = 0, i; 3054 u16 fidx; 3055 3056 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 3057 *bth2 = mask_psn(qp->s_psn); 3058 flow = find_flow_ib(req, *bth2, &fidx); 3059 if (!flow) { 3060 trace_hfi1_msg_tid_restart_req(/* msg */ 3061 qp, "!!!!!! Could not find flow to restart: bth2 ", 3062 (u64)*bth2); 3063 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, 3064 wqe->psn, wqe->lpsn, 3065 req); 3066 return; 3067 } 3068 } else { 3069 fidx = req->acked_tail; 3070 flow = &req->flows[fidx]; 3071 *bth2 = mask_psn(req->r_ack_psn); 3072 } 3073 3074 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 3075 delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn); 3076 else 3077 delta_pkts = delta_psn(*bth2, 3078 full_flow_psn(flow, 3079 flow->flow_state.spsn)); 3080 3081 trace_hfi1_tid_flow_restart_req(qp, fidx, flow); 3082 diff = delta_pkts + flow->resync_npkts; 3083 3084 flow->sent = 0; 3085 flow->pkt = 0; 3086 flow->tid_idx = 0; 3087 flow->tid_offset = 0; 3088 if (diff) { 3089 for (tididx = 0; tididx < flow->tidcnt; tididx++) { 3090 u32 tidentry = flow->tid_entry[tididx], tidlen, 3091 tidnpkts, npkts; 3092 3093 flow->tid_offset = 0; 3094 tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE; 3095 tidnpkts = rvt_div_round_up_mtu(qp, tidlen); 3096 npkts = min_t(u32, diff, tidnpkts); 3097 flow->pkt += npkts; 3098 flow->sent += (npkts == tidnpkts ? tidlen : 3099 npkts * qp->pmtu); 3100 flow->tid_offset += npkts * qp->pmtu; 3101 diff -= npkts; 3102 if (!diff) 3103 break; 3104 } 3105 } 3106 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 3107 rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) + 3108 flow->sent, 0); 3109 /* 3110 * Packet PSN is based on flow_state.spsn + flow->pkt. However, 3111 * during a RESYNC, the generation is incremented and the 3112 * sequence is reset to 0. 
Since we've adjusted the npkts in the 3113 * flow and the SGE has been sufficiently advanced, we have to 3114 * adjust flow->pkt in order to calculate the correct PSN. 3115 */ 3116 flow->pkt -= flow->resync_npkts; 3117 } 3118 3119 if (flow->tid_offset == 3120 EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) { 3121 tididx++; 3122 flow->tid_offset = 0; 3123 } 3124 flow->tid_idx = tididx; 3125 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 3126 /* Move flow_idx to correct index */ 3127 req->flow_idx = fidx; 3128 else 3129 req->clear_tail = fidx; 3130 3131 trace_hfi1_tid_flow_restart_req(qp, fidx, flow); 3132 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn, 3133 wqe->lpsn, req); 3134 req->state = TID_REQUEST_ACTIVE; 3135 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 3136 /* Reset all the flows that we are going to resend */ 3137 fidx = CIRC_NEXT(fidx, MAX_FLOWS); 3138 i = qpriv->s_tid_tail; 3139 do { 3140 for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS); 3141 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { 3142 req->flows[fidx].sent = 0; 3143 req->flows[fidx].pkt = 0; 3144 req->flows[fidx].tid_idx = 0; 3145 req->flows[fidx].tid_offset = 0; 3146 req->flows[fidx].resync_npkts = 0; 3147 } 3148 if (i == qpriv->s_tid_cur) 3149 break; 3150 do { 3151 i = (++i == qp->s_size ? 0 : i); 3152 wqe = rvt_get_swqe_ptr(qp, i); 3153 } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE); 3154 req = wqe_to_tid_req(wqe); 3155 req->cur_seg = req->ack_seg; 3156 fidx = req->acked_tail; 3157 /* Pull req->clear_tail back */ 3158 req->clear_tail = fidx; 3159 } while (1); 3160 } 3161} 3162 3163void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) 3164{ 3165 int i, ret; 3166 struct hfi1_qp_priv *qpriv = qp->priv; 3167 struct tid_flow_state *fs; 3168 3169 if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA)) 3170 return; 3171 3172 /* 3173 * First, clear the flow to help prevent any delayed packets from 3174 * being delivered. 3175 */ 3176 fs = &qpriv->flow_state; 3177 if (fs->index != RXE_NUM_TID_FLOWS) 3178 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 3179 3180 for (i = qp->s_acked; i != qp->s_head;) { 3181 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); 3182 3183 if (++i == qp->s_size) 3184 i = 0; 3185 /* Free only locally allocated TID entries */ 3186 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 3187 continue; 3188 do { 3189 struct hfi1_swqe_priv *priv = wqe->priv; 3190 3191 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); 3192 } while (!ret); 3193 } 3194 for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) { 3195 struct rvt_ack_entry *e = &qp->s_ack_queue[i]; 3196 3197 if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device))) 3198 i = 0; 3199 /* Free only locally allocated TID entries */ 3200 if (e->opcode != TID_OP(WRITE_REQ)) 3201 continue; 3202 do { 3203 struct hfi1_ack_priv *priv = e->priv; 3204 3205 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); 3206 } while (!ret); 3207 } 3208} 3209 3210bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) 3211{ 3212 struct rvt_swqe *prev; 3213 struct hfi1_qp_priv *priv = qp->priv; 3214 u32 s_prev; 3215 struct tid_rdma_request *req; 3216 3217 s_prev = (qp->s_cur == 0 ? 
qp->s_size : qp->s_cur) - 1; 3218 prev = rvt_get_swqe_ptr(qp, s_prev); 3219 3220 switch (wqe->wr.opcode) { 3221 case IB_WR_SEND: 3222 case IB_WR_SEND_WITH_IMM: 3223 case IB_WR_SEND_WITH_INV: 3224 case IB_WR_ATOMIC_CMP_AND_SWP: 3225 case IB_WR_ATOMIC_FETCH_AND_ADD: 3226 case IB_WR_RDMA_WRITE: 3227 case IB_WR_RDMA_WRITE_WITH_IMM: 3228 switch (prev->wr.opcode) { 3229 case IB_WR_TID_RDMA_WRITE: 3230 req = wqe_to_tid_req(prev); 3231 if (req->ack_seg != req->total_segs) 3232 goto interlock; 3233 break; 3234 default: 3235 break; 3236 } 3237 break; 3238 case IB_WR_RDMA_READ: 3239 if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) 3240 break; 3241 fallthrough; 3242 case IB_WR_TID_RDMA_READ: 3243 switch (prev->wr.opcode) { 3244 case IB_WR_RDMA_READ: 3245 if (qp->s_acked != qp->s_cur) 3246 goto interlock; 3247 break; 3248 case IB_WR_TID_RDMA_WRITE: 3249 req = wqe_to_tid_req(prev); 3250 if (req->ack_seg != req->total_segs) 3251 goto interlock; 3252 break; 3253 default: 3254 break; 3255 } 3256 break; 3257 default: 3258 break; 3259 } 3260 return false; 3261 3262interlock: 3263 priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK; 3264 return true; 3265} 3266 3267/* Does @sge meet the alignment requirements for tid rdma? */ 3268static inline bool hfi1_check_sge_align(struct rvt_qp *qp, 3269 struct rvt_sge *sge, int num_sge) 3270{ 3271 int i; 3272 3273 for (i = 0; i < num_sge; i++, sge++) { 3274 trace_hfi1_sge_check_align(qp, i, sge); 3275 if ((u64)sge->vaddr & ~PAGE_MASK || 3276 sge->sge_length & ~PAGE_MASK) 3277 return false; 3278 } 3279 return true; 3280} 3281 3282void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) 3283{ 3284 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 3285 struct hfi1_swqe_priv *priv = wqe->priv; 3286 struct tid_rdma_params *remote; 3287 enum ib_wr_opcode new_opcode; 3288 bool do_tid_rdma = false; 3289 struct hfi1_pportdata *ppd = qpriv->rcd->ppd; 3290 3291 if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) == 3292 ppd->lid) 3293 return; 3294 if (qpriv->hdr_type != HFI1_PKT_TYPE_9B) 3295 return; 3296 3297 rcu_read_lock(); 3298 remote = rcu_dereference(qpriv->tid_rdma.remote); 3299 /* 3300 * If TID RDMA is disabled by the negotiation, don't 3301 * use it. 3302 */ 3303 if (!remote) 3304 goto exit; 3305 3306 if (wqe->wr.opcode == IB_WR_RDMA_READ) { 3307 if (hfi1_check_sge_align(qp, &wqe->sg_list[0], 3308 wqe->wr.num_sge)) { 3309 new_opcode = IB_WR_TID_RDMA_READ; 3310 do_tid_rdma = true; 3311 } 3312 } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { 3313 /* 3314 * TID RDMA is enabled for this RDMA WRITE request iff: 3315 * 1. The remote address is page-aligned, 3316 * 2. The length is larger than the minimum segment size, 3317 * 3. The length is page-multiple. 
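 *
 * For example, with 4KB pages a 1MB RDMA WRITE to remote address
 * 0x10000 passes the alignment checks below, whereas a 1MB + 512B
 * write, or a write to address 0x10200, stays on the ordinary RDMA
 * WRITE path.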
3318 */ 3319 if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) && 3320 !(wqe->length & ~PAGE_MASK)) { 3321 new_opcode = IB_WR_TID_RDMA_WRITE; 3322 do_tid_rdma = true; 3323 } 3324 } 3325 3326 if (do_tid_rdma) { 3327 if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC)) 3328 goto exit; 3329 wqe->wr.opcode = new_opcode; 3330 priv->tid_req.seg_len = 3331 min_t(u32, remote->max_len, wqe->length); 3332 priv->tid_req.total_segs = 3333 DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len); 3334 /* Compute the last PSN of the request */ 3335 wqe->lpsn = wqe->psn; 3336 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 3337 priv->tid_req.n_flows = remote->max_read; 3338 qpriv->tid_r_reqs++; 3339 wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1; 3340 } else { 3341 wqe->lpsn += priv->tid_req.total_segs - 1; 3342 atomic_inc(&qpriv->n_requests); 3343 } 3344 3345 priv->tid_req.cur_seg = 0; 3346 priv->tid_req.comp_seg = 0; 3347 priv->tid_req.ack_seg = 0; 3348 priv->tid_req.state = TID_REQUEST_INACTIVE; 3349 /* 3350 * Reset acked_tail. 3351 * TID RDMA READ does not have ACKs so it does not 3352 * update the pointer. We have to reset it so TID RDMA 3353 * WRITE does not get confused. 3354 */ 3355 priv->tid_req.acked_tail = priv->tid_req.setup_head; 3356 trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode, 3357 wqe->psn, wqe->lpsn, 3358 &priv->tid_req); 3359 } 3360exit: 3361 rcu_read_unlock(); 3362} 3363 3364/* TID RDMA WRITE functions */ 3365 3366u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 3367 struct ib_other_headers *ohdr, 3368 u32 *bth1, u32 *bth2, u32 *len) 3369{ 3370 struct hfi1_qp_priv *qpriv = qp->priv; 3371 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 3372 struct tid_rdma_params *remote; 3373 3374 rcu_read_lock(); 3375 remote = rcu_dereference(qpriv->tid_rdma.remote); 3376 /* 3377 * Set the number of flow to be used based on negotiated 3378 * parameters. 3379 */ 3380 req->n_flows = remote->max_write; 3381 req->state = TID_REQUEST_ACTIVE; 3382 3383 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1); 3384 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey); 3385 ohdr->u.tid_rdma.w_req.reth.vaddr = 3386 cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len)); 3387 ohdr->u.tid_rdma.w_req.reth.rkey = 3388 cpu_to_be32(wqe->rdma_wr.rkey); 3389 ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len); 3390 ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn); 3391 *bth1 &= ~RVT_QPN_MASK; 3392 *bth1 |= remote->qp; 3393 qp->s_state = TID_OP(WRITE_REQ); 3394 qp->s_flags |= HFI1_S_WAIT_TID_RESP; 3395 *bth2 |= IB_BTH_REQ_ACK; 3396 *len = 0; 3397 3398 rcu_read_unlock(); 3399 return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); 3400} 3401 3402static u32 hfi1_compute_tid_rdma_flow_wt(struct rvt_qp *qp) 3403{ 3404 /* 3405 * Heuristic for computing the RNR timeout when waiting on the flow 3406 * queue. Rather than a computationaly expensive exact estimate of when 3407 * a flow will be available, we assume that if a QP is at position N in 3408 * the flow queue it has to wait approximately (N + 1) * (number of 3409 * segments between two sync points). The rationale for this is that 3410 * flows are released and recycled at each sync point. 3411 */ 3412 return (MAX_TID_FLOW_PSN * qp->pmtu) >> TID_RDMA_SEGMENT_SHIFT; 3413} 3414 3415static u32 position_in_queue(struct hfi1_qp_priv *qpriv, 3416 struct tid_queue *queue) 3417{ 3418 return qpriv->tid_enqueue - queue->dequeue; 3419} 3420 3421/* 3422 * @qp: points to rvt_qp context. 
3423 * @to_seg: desired RNR timeout in segments. 3424 * Return: index of the next highest timeout in the ib_hfi1_rnr_table[] 3425 */ 3426static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg) 3427{ 3428 struct hfi1_qp_priv *qpriv = qp->priv; 3429 u64 timeout; 3430 u32 bytes_per_us; 3431 u8 i; 3432 3433 bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8; 3434 timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us; 3435 /* 3436 * Find the next highest value in the RNR table to the required 3437 * timeout. This gives the responder some padding. 3438 */ 3439 for (i = 1; i <= IB_AETH_CREDIT_MASK; i++) 3440 if (rvt_rnr_tbl_to_usec(i) >= timeout) 3441 return i; 3442 return 0; 3443} 3444 3445/* 3446 * Central place for resource allocation at TID write responder, 3447 * is called from write_req and write_data interrupt handlers as 3448 * well as the send thread when a queued QP is scheduled for 3449 * resource allocation. 3450 * 3451 * Iterates over (a) segments of a request and then (b) queued requests 3452 * themselves to allocate resources for up to local->max_write 3453 * segments across multiple requests. Stop allocating when we 3454 * hit a sync point, resume allocating after data packets at 3455 * sync point have been received. 3456 * 3457 * Resource allocation and sending of responses is decoupled. The 3458 * request/segment which are being allocated and sent are as follows. 3459 * Resources are allocated for: 3460 * [request: qpriv->r_tid_alloc, segment: req->alloc_seg] 3461 * The send thread sends: 3462 * [request: qp->s_tail_ack_queue, segment:req->cur_seg] 3463 */ 3464static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) 3465{ 3466 struct tid_rdma_request *req; 3467 struct hfi1_qp_priv *qpriv = qp->priv; 3468 struct hfi1_ctxtdata *rcd = qpriv->rcd; 3469 struct tid_rdma_params *local = &qpriv->tid_rdma.local; 3470 struct rvt_ack_entry *e; 3471 u32 npkts, to_seg; 3472 bool last; 3473 int ret = 0; 3474 3475 lockdep_assert_held(&qp->s_lock); 3476 3477 while (1) { 3478 trace_hfi1_rsp_tid_write_alloc_res(qp, 0); 3479 trace_hfi1_tid_write_rsp_alloc_res(qp); 3480 /* 3481 * Don't allocate more segments if a RNR NAK has already been 3482 * scheduled to avoid messing up qp->r_psn: the RNR NAK will 3483 * be sent only when all allocated segments have been sent. 3484 * However, if more segments are allocated before that, TID RDMA 3485 * WRITE RESP packets will be sent out for these new segments 3486 * before the RNR NAK packet. When the requester receives the 3487 * RNR NAK packet, it will restart with qp->s_last_psn + 1, 3488 * which does not match qp->r_psn and will be dropped. 3489 * Consequently, the requester will exhaust its retries and 3490 * put the qp into error state. 
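 *
 * More generally, the loop below stops allocating as soon as any of
 * the following holds: an RNR NAK is already pending, it runs out of
 * queued requests, local->max_write segments are outstanding, a sync
 * point still has data packets in flight, the KDETH PSN space of the
 * current generation is exhausted, the flow ring would overtake
 * req->acked_tail, or flow/rcv array allocation fails.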
3491 */ 3492 if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND) 3493 break; 3494 3495 /* No requests left to process */ 3496 if (qpriv->r_tid_alloc == qpriv->r_tid_head) { 3497 /* If all data has been received, clear the flow */ 3498 if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS && 3499 !qpriv->alloc_w_segs) { 3500 hfi1_kern_clear_hw_flow(rcd, qp); 3501 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 3502 } 3503 break; 3504 } 3505 3506 e = &qp->s_ack_queue[qpriv->r_tid_alloc]; 3507 if (e->opcode != TID_OP(WRITE_REQ)) 3508 goto next_req; 3509 req = ack_to_tid_req(e); 3510 trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn, 3511 e->lpsn, req); 3512 /* Finished allocating for all segments of this request */ 3513 if (req->alloc_seg >= req->total_segs) 3514 goto next_req; 3515 3516 /* Can allocate only a maximum of local->max_write for a QP */ 3517 if (qpriv->alloc_w_segs >= local->max_write) 3518 break; 3519 3520 /* Don't allocate at a sync point with data packets pending */ 3521 if (qpriv->sync_pt && qpriv->alloc_w_segs) 3522 break; 3523 3524 /* All data received at the sync point, continue */ 3525 if (qpriv->sync_pt && !qpriv->alloc_w_segs) { 3526 hfi1_kern_clear_hw_flow(rcd, qp); 3527 qpriv->sync_pt = false; 3528 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 3529 } 3530 3531 /* Allocate flow if we don't have one */ 3532 if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) { 3533 ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp); 3534 if (ret) { 3535 to_seg = hfi1_compute_tid_rdma_flow_wt(qp) * 3536 position_in_queue(qpriv, 3537 &rcd->flow_queue); 3538 break; 3539 } 3540 } 3541 3542 npkts = rvt_div_round_up_mtu(qp, req->seg_len); 3543 3544 /* 3545 * We are at a sync point if we run out of KDETH PSN space. 3546 * Last PSN of every generation is reserved for RESYNC. 3547 */ 3548 if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) { 3549 qpriv->sync_pt = true; 3550 break; 3551 } 3552 3553 /* 3554 * If overtaking req->acked_tail, send an RNR NAK. 
Because the 3555 * QP is not queued in this case, and the issue can only be 3556 * caused by a delay in scheduling the second leg which we 3557 * cannot estimate, we use a rather arbitrary RNR timeout of 3558 * (MAX_FLOWS / 2) segments 3559 */ 3560 if (!CIRC_SPACE(req->setup_head, req->acked_tail, 3561 MAX_FLOWS)) { 3562 ret = -EAGAIN; 3563 to_seg = MAX_FLOWS >> 1; 3564 tid_rdma_trigger_ack(qp); 3565 break; 3566 } 3567 3568 /* Try to allocate rcv array / TID entries */ 3569 ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last); 3570 if (ret == -EAGAIN) 3571 to_seg = position_in_queue(qpriv, &rcd->rarr_queue); 3572 if (ret) 3573 break; 3574 3575 qpriv->alloc_w_segs++; 3576 req->alloc_seg++; 3577 continue; 3578next_req: 3579 /* Begin processing the next request */ 3580 if (++qpriv->r_tid_alloc > 3581 rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3582 qpriv->r_tid_alloc = 0; 3583 } 3584 3585 /* 3586 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation 3587 * has failed (b) we are called from the rcv handler interrupt context 3588 * (c) an RNR NAK has not already been scheduled 3589 */ 3590 if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state) 3591 goto send_rnr_nak; 3592 3593 return; 3594 3595send_rnr_nak: 3596 lockdep_assert_held(&qp->r_lock); 3597 3598 /* Set r_nak_state to prevent unrelated events from generating NAK's */ 3599 qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK; 3600 3601 /* Pull back r_psn to the segment being RNR NAK'd */ 3602 qp->r_psn = e->psn + req->alloc_seg; 3603 qp->r_ack_psn = qp->r_psn; 3604 /* 3605 * Pull back r_head_ack_queue to the ack entry following the request 3606 * being RNR NAK'd. This allows resources to be allocated to the request 3607 * if the queued QP is scheduled. 3608 */ 3609 qp->r_head_ack_queue = qpriv->r_tid_alloc + 1; 3610 if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3611 qp->r_head_ack_queue = 0; 3612 qpriv->r_tid_head = qp->r_head_ack_queue; 3613 /* 3614 * These send side fields are used in make_rc_ack(). They are set in 3615 * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock 3616 * for consistency 3617 */ 3618 qp->s_nak_state = qp->r_nak_state; 3619 qp->s_ack_psn = qp->r_ack_psn; 3620 /* 3621 * Clear the ACK PENDING flag to prevent unwanted ACK because we 3622 * have modified qp->s_ack_psn here. 3623 */ 3624 qp->s_flags &= ~(RVT_S_ACK_PENDING); 3625 3626 trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn); 3627 /* 3628 * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK 3629 * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be 3630 * used for this because qp->s_lock is dropped before calling 3631 * hfi1_send_rc_ack() leading to inconsistency between the receive 3632 * interrupt handlers and the send thread in make_rc_ack() 3633 */ 3634 qpriv->rnr_nak_state = TID_RNR_NAK_SEND; 3635 3636 /* 3637 * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive 3638 * interrupt handlers but will be sent from the send engine behind any 3639 * previous responses that may have been scheduled 3640 */ 3641 rc_defered_ack(rcd, qp); 3642} 3643 3644void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) 3645{ 3646 /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/ 3647 3648 /* 3649 * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST 3650 * (see hfi1_rc_rcv()) 3651 * - Don't allow 0-length requests. 3652 * 2. 
Put TID RDMA WRITE REQ into the response queueu (s_ack_queue) 3653 * - Setup struct tid_rdma_req with request info 3654 * - Prepare struct tid_rdma_flow array? 3655 * 3. Set the qp->s_ack_state as state diagram in design doc. 3656 * 4. Set RVT_S_RESP_PENDING in s_flags. 3657 * 5. Kick the send engine (hfi1_schedule_send()) 3658 */ 3659 struct hfi1_ctxtdata *rcd = packet->rcd; 3660 struct rvt_qp *qp = packet->qp; 3661 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 3662 struct ib_other_headers *ohdr = packet->ohdr; 3663 struct rvt_ack_entry *e; 3664 unsigned long flags; 3665 struct ib_reth *reth; 3666 struct hfi1_qp_priv *qpriv = qp->priv; 3667 struct tid_rdma_request *req; 3668 u32 bth0, psn, len, rkey, num_segs; 3669 bool fecn; 3670 u8 next; 3671 u64 vaddr; 3672 int diff; 3673 3674 bth0 = be32_to_cpu(ohdr->bth[0]); 3675 if (hfi1_ruc_check_hdr(ibp, packet)) 3676 return; 3677 3678 fecn = process_ecn(qp, packet); 3679 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 3680 trace_hfi1_rsp_rcv_tid_write_req(qp, psn); 3681 3682 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 3683 rvt_comm_est(qp); 3684 3685 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) 3686 goto nack_inv; 3687 3688 reth = &ohdr->u.tid_rdma.w_req.reth; 3689 vaddr = be64_to_cpu(reth->vaddr); 3690 len = be32_to_cpu(reth->length); 3691 3692 num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len); 3693 diff = delta_psn(psn, qp->r_psn); 3694 if (unlikely(diff)) { 3695 tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); 3696 return; 3697 } 3698 3699 /* 3700 * The resent request which was previously RNR NAK'd is inserted at the 3701 * location of the original request, which is one entry behind 3702 * r_head_ack_queue 3703 */ 3704 if (qpriv->rnr_nak_state) 3705 qp->r_head_ack_queue = qp->r_head_ack_queue ? 3706 qp->r_head_ack_queue - 1 : 3707 rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); 3708 3709 /* We've verified the request, insert it into the ack queue. 
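 * The entry at r_head_ack_queue is (re)used for this request; 'next' below
 * is the slot that becomes the new head and wraps back to 0 once it passes
 * rvt_size_atomic().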
*/ 3710 next = qp->r_head_ack_queue + 1; 3711 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3712 next = 0; 3713 spin_lock_irqsave(&qp->s_lock, flags); 3714 if (unlikely(next == qp->s_acked_ack_queue)) { 3715 if (!qp->s_ack_queue[next].sent) 3716 goto nack_inv_unlock; 3717 update_ack_queue(qp, next); 3718 } 3719 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 3720 req = ack_to_tid_req(e); 3721 3722 /* Bring previously RNR NAK'd request back to life */ 3723 if (qpriv->rnr_nak_state) { 3724 qp->r_nak_state = 0; 3725 qp->s_nak_state = 0; 3726 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 3727 qp->r_psn = e->lpsn + 1; 3728 req->state = TID_REQUEST_INIT; 3729 goto update_head; 3730 } 3731 3732 release_rdma_sge_mr(e); 3733 3734 /* The length needs to be in multiples of PAGE_SIZE */ 3735 if (!len || len & ~PAGE_MASK) 3736 goto nack_inv_unlock; 3737 3738 rkey = be32_to_cpu(reth->rkey); 3739 qp->r_len = len; 3740 3741 if (e->opcode == TID_OP(WRITE_REQ) && 3742 (req->setup_head != req->clear_tail || 3743 req->clear_tail != req->acked_tail)) 3744 goto nack_inv_unlock; 3745 3746 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, 3747 rkey, IB_ACCESS_REMOTE_WRITE))) 3748 goto nack_acc; 3749 3750 qp->r_psn += num_segs - 1; 3751 3752 e->opcode = (bth0 >> 24) & 0xff; 3753 e->psn = psn; 3754 e->lpsn = qp->r_psn; 3755 e->sent = 0; 3756 3757 req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write); 3758 req->state = TID_REQUEST_INIT; 3759 req->cur_seg = 0; 3760 req->comp_seg = 0; 3761 req->ack_seg = 0; 3762 req->alloc_seg = 0; 3763 req->isge = 0; 3764 req->seg_len = qpriv->tid_rdma.local.max_len; 3765 req->total_len = len; 3766 req->total_segs = num_segs; 3767 req->r_flow_psn = e->psn; 3768 req->ss.sge = e->rdma_sge; 3769 req->ss.num_sge = 1; 3770 3771 req->flow_idx = req->setup_head; 3772 req->clear_tail = req->setup_head; 3773 req->acked_tail = req->setup_head; 3774 3775 qp->r_state = e->opcode; 3776 qp->r_nak_state = 0; 3777 /* 3778 * We need to increment the MSN here instead of when we 3779 * finish sending the result since a duplicate request would 3780 * increment it more than once. 3781 */ 3782 qp->r_msn++; 3783 qp->r_psn++; 3784 3785 trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn, 3786 req); 3787 3788 if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { 3789 qpriv->r_tid_tail = qp->r_head_ack_queue; 3790 } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { 3791 struct tid_rdma_request *ptr; 3792 3793 e = &qp->s_ack_queue[qpriv->r_tid_tail]; 3794 ptr = ack_to_tid_req(e); 3795 3796 if (e->opcode != TID_OP(WRITE_REQ) || 3797 ptr->comp_seg == ptr->total_segs) { 3798 if (qpriv->r_tid_tail == qpriv->r_tid_ack) 3799 qpriv->r_tid_ack = qp->r_head_ack_queue; 3800 qpriv->r_tid_tail = qp->r_head_ack_queue; 3801 } 3802 } 3803update_head: 3804 qp->r_head_ack_queue = next; 3805 qpriv->r_tid_head = qp->r_head_ack_queue; 3806 3807 hfi1_tid_write_alloc_resources(qp, true); 3808 trace_hfi1_tid_write_rsp_rcv_req(qp); 3809 3810 /* Schedule the send tasklet. 
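 * RVT_S_RESP_PENDING tells the send engine that a TID RDMA WRITE RESP still
 * needs to be built for this ack queue entry.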
*/ 3811 qp->s_flags |= RVT_S_RESP_PENDING; 3812 if (fecn) 3813 qp->s_flags |= RVT_S_ECN; 3814 hfi1_schedule_send(qp); 3815 3816 spin_unlock_irqrestore(&qp->s_lock, flags); 3817 return; 3818 3819nack_inv_unlock: 3820 spin_unlock_irqrestore(&qp->s_lock, flags); 3821nack_inv: 3822 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 3823 qp->r_nak_state = IB_NAK_INVALID_REQUEST; 3824 qp->r_ack_psn = qp->r_psn; 3825 /* Queue NAK for later */ 3826 rc_defered_ack(rcd, qp); 3827 return; 3828nack_acc: 3829 spin_unlock_irqrestore(&qp->s_lock, flags); 3830 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 3831 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 3832 qp->r_ack_psn = qp->r_psn; 3833} 3834 3835u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, 3836 struct ib_other_headers *ohdr, u32 *bth1, 3837 u32 bth2, u32 *len, 3838 struct rvt_sge_state **ss) 3839{ 3840 struct hfi1_ack_priv *epriv = e->priv; 3841 struct tid_rdma_request *req = &epriv->tid_req; 3842 struct hfi1_qp_priv *qpriv = qp->priv; 3843 struct tid_rdma_flow *flow = NULL; 3844 u32 resp_len = 0, hdwords = 0; 3845 void *resp_addr = NULL; 3846 struct tid_rdma_params *remote; 3847 3848 trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn, 3849 req); 3850 trace_hfi1_tid_write_rsp_build_resp(qp); 3851 trace_hfi1_rsp_build_tid_write_resp(qp, bth2); 3852 flow = &req->flows[req->flow_idx]; 3853 switch (req->state) { 3854 default: 3855 /* 3856 * Try to allocate resources here in case QP was queued and was 3857 * later scheduled when resources became available 3858 */ 3859 hfi1_tid_write_alloc_resources(qp, false); 3860 3861 /* We've already sent everything which is ready */ 3862 if (req->cur_seg >= req->alloc_seg) 3863 goto done; 3864 3865 /* 3866 * Resources can be assigned but responses cannot be sent in 3867 * rnr_nak state, till the resent request is received 3868 */ 3869 if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT) 3870 goto done; 3871 3872 req->state = TID_REQUEST_ACTIVE; 3873 trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); 3874 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); 3875 hfi1_add_tid_reap_timer(qp); 3876 break; 3877 3878 case TID_REQUEST_RESEND_ACTIVE: 3879 case TID_REQUEST_RESEND: 3880 trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); 3881 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); 3882 if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) 3883 req->state = TID_REQUEST_ACTIVE; 3884 3885 hfi1_mod_tid_reap_timer(qp); 3886 break; 3887 } 3888 flow->flow_state.resp_ib_psn = bth2; 3889 resp_addr = (void *)flow->tid_entry; 3890 resp_len = sizeof(*flow->tid_entry) * flow->tidcnt; 3891 req->cur_seg++; 3892 3893 memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp)); 3894 epriv->ss.sge.vaddr = resp_addr; 3895 epriv->ss.sge.sge_length = resp_len; 3896 epriv->ss.sge.length = epriv->ss.sge.sge_length; 3897 /* 3898 * We can safely zero these out. Since the first SGE covers the 3899 * entire packet, nothing else should even look at the MR. 
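 * The response payload is the flow's TID entry array (resp_addr above),
 * which is kernel memory with no MR behind it.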
3900 */ 3901 epriv->ss.sge.mr = NULL; 3902 epriv->ss.sge.m = 0; 3903 epriv->ss.sge.n = 0; 3904 3905 epriv->ss.sg_list = NULL; 3906 epriv->ss.total_len = epriv->ss.sge.sge_length; 3907 epriv->ss.num_sge = 1; 3908 3909 *ss = &epriv->ss; 3910 *len = epriv->ss.total_len; 3911 3912 /* Construct the TID RDMA WRITE RESP packet header */ 3913 rcu_read_lock(); 3914 remote = rcu_dereference(qpriv->tid_rdma.remote); 3915 3916 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1); 3917 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey); 3918 ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp); 3919 ohdr->u.tid_rdma.w_rsp.tid_flow_psn = 3920 cpu_to_be32((flow->flow_state.generation << 3921 HFI1_KDETH_BTH_SEQ_SHIFT) | 3922 (flow->flow_state.spsn & 3923 HFI1_KDETH_BTH_SEQ_MASK)); 3924 ohdr->u.tid_rdma.w_rsp.tid_flow_qp = 3925 cpu_to_be32(qpriv->tid_rdma.local.qp | 3926 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 3927 TID_RDMA_DESTQP_FLOW_SHIFT) | 3928 qpriv->rcd->ctxt); 3929 ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn); 3930 *bth1 = remote->qp; 3931 rcu_read_unlock(); 3932 hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32); 3933 qpriv->pending_tid_w_segs++; 3934done: 3935 return hdwords; 3936} 3937 3938static void hfi1_add_tid_reap_timer(struct rvt_qp *qp) 3939{ 3940 struct hfi1_qp_priv *qpriv = qp->priv; 3941 3942 lockdep_assert_held(&qp->s_lock); 3943 if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) { 3944 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; 3945 qpriv->s_tid_timer.expires = jiffies + 3946 qpriv->tid_timer_timeout_jiffies; 3947 add_timer(&qpriv->s_tid_timer); 3948 } 3949} 3950 3951static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp) 3952{ 3953 struct hfi1_qp_priv *qpriv = qp->priv; 3954 3955 lockdep_assert_held(&qp->s_lock); 3956 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; 3957 mod_timer(&qpriv->s_tid_timer, jiffies + 3958 qpriv->tid_timer_timeout_jiffies); 3959} 3960 3961static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp) 3962{ 3963 struct hfi1_qp_priv *qpriv = qp->priv; 3964 int rval = 0; 3965 3966 lockdep_assert_held(&qp->s_lock); 3967 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { 3968 rval = del_timer(&qpriv->s_tid_timer); 3969 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; 3970 } 3971 return rval; 3972} 3973 3974void hfi1_del_tid_reap_timer(struct rvt_qp *qp) 3975{ 3976 struct hfi1_qp_priv *qpriv = qp->priv; 3977 3978 del_timer_sync(&qpriv->s_tid_timer); 3979 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; 3980} 3981 3982static void hfi1_tid_timeout(struct timer_list *t) 3983{ 3984 struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer); 3985 struct rvt_qp *qp = qpriv->owner; 3986 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); 3987 unsigned long flags; 3988 u32 i; 3989 3990 spin_lock_irqsave(&qp->r_lock, flags); 3991 spin_lock(&qp->s_lock); 3992 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { 3993 dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", 3994 qp->ibqp.qp_num, __func__, __LINE__); 3995 trace_hfi1_msg_tid_timeout(/* msg */ 3996 qp, "resource timeout = ", 3997 (u64)qpriv->tid_timer_timeout_jiffies); 3998 hfi1_stop_tid_reap_timer(qp); 3999 /* 4000 * Go though the entire ack queue and clear any outstanding 4001 * HW flow and RcvArray resources. 
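 * This is the last-resort reap path: the requester has stopped sending
 * TID RDMA WRITE DATA, so reclaim the receive resources and move the QP
 * to the error state.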
4002 */ 4003 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 4004 for (i = 0; i < rvt_max_atomic(rdi); i++) { 4005 struct tid_rdma_request *req = 4006 ack_to_tid_req(&qp->s_ack_queue[i]); 4007 4008 hfi1_kern_exp_rcv_clear_all(req); 4009 } 4010 spin_unlock(&qp->s_lock); 4011 if (qp->ibqp.event_handler) { 4012 struct ib_event ev; 4013 4014 ev.device = qp->ibqp.device; 4015 ev.element.qp = &qp->ibqp; 4016 ev.event = IB_EVENT_QP_FATAL; 4017 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); 4018 } 4019 rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR); 4020 goto unlock_r_lock; 4021 } 4022 spin_unlock(&qp->s_lock); 4023unlock_r_lock: 4024 spin_unlock_irqrestore(&qp->r_lock, flags); 4025} 4026 4027void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) 4028{ 4029 /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side */ 4030 4031 /* 4032 * 1. Find matching SWQE 4033 * 2. Check that TIDENTRY array has enough space for a complete 4034 * segment. If not, put QP in error state. 4035 * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow 4036 * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags. 4037 * 5. Set qp->s_state 4038 * 6. Kick the send engine (hfi1_schedule_send()) 4039 */ 4040 struct ib_other_headers *ohdr = packet->ohdr; 4041 struct rvt_qp *qp = packet->qp; 4042 struct hfi1_qp_priv *qpriv = qp->priv; 4043 struct hfi1_ctxtdata *rcd = packet->rcd; 4044 struct rvt_swqe *wqe; 4045 struct tid_rdma_request *req; 4046 struct tid_rdma_flow *flow; 4047 enum ib_wc_status status; 4048 u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen; 4049 bool fecn; 4050 unsigned long flags; 4051 4052 fecn = process_ecn(qp, packet); 4053 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4054 aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth); 4055 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 4056 4057 spin_lock_irqsave(&qp->s_lock, flags); 4058 4059 /* Ignore invalid responses */ 4060 if (cmp_psn(psn, qp->s_next_psn) >= 0) 4061 goto ack_done; 4062 4063 /* Ignore duplicate responses. */ 4064 if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0)) 4065 goto ack_done; 4066 4067 if (unlikely(qp->s_acked == qp->s_tail)) 4068 goto ack_done; 4069 4070 /* 4071 * If we are waiting for a particular packet sequence number 4072 * due to a request being resent, check for it. Otherwise, 4073 * ensure that we haven't missed anything. 4074 */ 4075 if (qp->r_flags & RVT_R_RDMAR_SEQ) { 4076 if (cmp_psn(psn, qp->s_last_psn + 1) != 0) 4077 goto ack_done; 4078 qp->r_flags &= ~RVT_R_RDMAR_SEQ; 4079 } 4080 4081 wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); 4082 if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)) 4083 goto ack_op_err; 4084 4085 req = wqe_to_tid_req(wqe); 4086 /* 4087 * If we've lost ACKs and our acked_tail pointer is too far 4088 * behind, don't overwrite segments. Just drop the packet and 4089 * let the reliability protocol take care of it. 4090 */ 4091 if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS)) 4092 goto ack_done; 4093 4094 /* 4095 * The call to do_rc_ack() should be last in the chain of 4096 * packet checks because it will end up updating the QP state. 4097 * Therefore, anything that would prevent the packet from 4098 * being accepted as a successful response should be prior 4099 * to it. 
4100 */ 4101 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) 4102 goto ack_done; 4103 4104 trace_hfi1_ack(qp, psn); 4105 4106 flow = &req->flows[req->setup_head]; 4107 flow->pkt = 0; 4108 flow->tid_idx = 0; 4109 flow->tid_offset = 0; 4110 flow->sent = 0; 4111 flow->resync_npkts = 0; 4112 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp); 4113 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & 4114 TID_RDMA_DESTQP_FLOW_MASK; 4115 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn)); 4116 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 4117 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; 4118 flow->flow_state.resp_ib_psn = psn; 4119 flow->length = min_t(u32, req->seg_len, 4120 (wqe->length - (req->comp_seg * req->seg_len))); 4121 4122 flow->npkts = rvt_div_round_up_mtu(qp, flow->length); 4123 flow->flow_state.lpsn = flow->flow_state.spsn + 4124 flow->npkts - 1; 4125 /* payload length = packet length - (header length + ICRC length) */ 4126 pktlen = packet->tlen - (packet->hlen + 4); 4127 if (pktlen > sizeof(flow->tid_entry)) { 4128 status = IB_WC_LOC_LEN_ERR; 4129 goto ack_err; 4130 } 4131 memcpy(flow->tid_entry, packet->ebuf, pktlen); 4132 flow->tidcnt = pktlen / sizeof(*flow->tid_entry); 4133 trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow); 4134 4135 req->comp_seg++; 4136 trace_hfi1_tid_write_sender_rcv_resp(qp, 0); 4137 /* 4138 * Walk the TID_ENTRY list to make sure we have enough space for a 4139 * complete segment. 4140 */ 4141 for (i = 0; i < flow->tidcnt; i++) { 4142 trace_hfi1_tid_entry_rcv_write_resp(/* entry */ 4143 qp, i, flow->tid_entry[i]); 4144 if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { 4145 status = IB_WC_LOC_LEN_ERR; 4146 goto ack_err; 4147 } 4148 tidlen += EXP_TID_GET(flow->tid_entry[i], LEN); 4149 } 4150 if (tidlen * PAGE_SIZE < flow->length) { 4151 status = IB_WC_LOC_LEN_ERR; 4152 goto ack_err; 4153 } 4154 4155 trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn, 4156 wqe->lpsn, req); 4157 /* 4158 * If this is the first response for this request, set the initial 4159 * flow index to the current flow. 4160 */ 4161 if (!cmp_psn(psn, wqe->psn)) { 4162 req->r_last_acked = mask_psn(wqe->psn - 1); 4163 /* Set acked flow index to head index */ 4164 req->acked_tail = req->setup_head; 4165 } 4166 4167 /* advance circular buffer head */ 4168 req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS); 4169 req->state = TID_REQUEST_ACTIVE; 4170 4171 /* 4172 * If all responses for this TID RDMA WRITE request have been received 4173 * advance the pointer to the next one. 4174 * Since TID RDMA requests could be mixed in with regular IB requests, 4175 * they might not appear sequentially in the queue. Therefore, the 4176 * next request needs to be "found". 
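 * The loop below stops at the next TID RDMA WRITE WQE, or at s_tid_head
 * if no further TID RDMA WRITE is queued.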
4177 */ 4178 if (qpriv->s_tid_cur != qpriv->s_tid_head && 4179 req->comp_seg == req->total_segs) { 4180 for (i = qpriv->s_tid_cur + 1; ; i++) { 4181 if (i == qp->s_size) 4182 i = 0; 4183 wqe = rvt_get_swqe_ptr(qp, i); 4184 if (i == qpriv->s_tid_head) 4185 break; 4186 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 4187 break; 4188 } 4189 qpriv->s_tid_cur = i; 4190 } 4191 qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; 4192 hfi1_schedule_tid_send(qp); 4193 goto ack_done; 4194 4195ack_op_err: 4196 status = IB_WC_LOC_QP_OP_ERR; 4197ack_err: 4198 rvt_error_qp(qp, status); 4199ack_done: 4200 if (fecn) 4201 qp->s_flags |= RVT_S_ECN; 4202 spin_unlock_irqrestore(&qp->s_lock, flags); 4203} 4204 4205bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, 4206 struct ib_other_headers *ohdr, 4207 u32 *bth1, u32 *bth2, u32 *len) 4208{ 4209 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 4210 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 4211 struct tid_rdma_params *remote; 4212 struct rvt_qp *qp = req->qp; 4213 struct hfi1_qp_priv *qpriv = qp->priv; 4214 u32 tidentry = flow->tid_entry[flow->tid_idx]; 4215 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; 4216 struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data; 4217 u32 next_offset, om = KDETH_OM_LARGE; 4218 bool last_pkt; 4219 4220 if (!tidlen) { 4221 hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR); 4222 rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR); 4223 } 4224 4225 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); 4226 flow->sent += *len; 4227 next_offset = flow->tid_offset + *len; 4228 last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && 4229 next_offset >= tidlen) || (flow->sent >= flow->length); 4230 trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry); 4231 trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow); 4232 4233 rcu_read_lock(); 4234 remote = rcu_dereference(qpriv->tid_rdma.remote); 4235 KDETH_RESET(wd->kdeth0, KVER, 0x1); 4236 KDETH_SET(wd->kdeth0, SH, !last_pkt); 4237 KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg)); 4238 KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); 4239 KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); 4240 KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE); 4241 KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om); 4242 KDETH_RESET(wd->kdeth1, JKEY, remote->jkey); 4243 wd->verbs_qp = cpu_to_be32(qp->remote_qpn); 4244 rcu_read_unlock(); 4245 4246 *bth1 = flow->tid_qpn; 4247 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & 4248 HFI1_KDETH_BTH_SEQ_MASK) | 4249 (flow->flow_state.generation << 4250 HFI1_KDETH_BTH_SEQ_SHIFT)); 4251 if (last_pkt) { 4252 /* PSNs are zero-based, so +1 to count number of packets */ 4253 if (flow->flow_state.lpsn + 1 + 4254 rvt_div_round_up_mtu(qp, req->seg_len) > 4255 MAX_TID_FLOW_PSN) 4256 req->state = TID_REQUEST_SYNC; 4257 *bth2 |= IB_BTH_REQ_ACK; 4258 } 4259 4260 if (next_offset >= tidlen) { 4261 flow->tid_offset = 0; 4262 flow->tid_idx++; 4263 } else { 4264 flow->tid_offset = next_offset; 4265 } 4266 return last_pkt; 4267} 4268 4269void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) 4270{ 4271 struct rvt_qp *qp = packet->qp; 4272 struct hfi1_qp_priv *priv = qp->priv; 4273 struct hfi1_ctxtdata *rcd = priv->rcd; 4274 struct ib_other_headers *ohdr = packet->ohdr; 4275 struct rvt_ack_entry *e; 4276 struct tid_rdma_request *req; 4277 struct tid_rdma_flow *flow; 4278 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 4279 unsigned long flags; 4280 u32 psn, next; 4281 u8 opcode; 4282 bool fecn; 
4283 4284 fecn = process_ecn(qp, packet); 4285 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4286 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 4287 4288 /* 4289 * All error handling should be done by now. If we are here, the packet 4290 * is either good or been accepted by the error handler. 4291 */ 4292 spin_lock_irqsave(&qp->s_lock, flags); 4293 e = &qp->s_ack_queue[priv->r_tid_tail]; 4294 req = ack_to_tid_req(e); 4295 flow = &req->flows[req->clear_tail]; 4296 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) { 4297 update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); 4298 4299 if (cmp_psn(psn, flow->flow_state.r_next_psn)) 4300 goto send_nak; 4301 4302 flow->flow_state.r_next_psn = mask_psn(psn + 1); 4303 /* 4304 * Copy the payload to destination buffer if this packet is 4305 * delivered as an eager packet due to RSM rule and FECN. 4306 * The RSM rule selects FECN bit in BTH and SH bit in 4307 * KDETH header and therefore will not match the last 4308 * packet of each segment that has SH bit cleared. 4309 */ 4310 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { 4311 struct rvt_sge_state ss; 4312 u32 len; 4313 u32 tlen = packet->tlen; 4314 u16 hdrsize = packet->hlen; 4315 u8 pad = packet->pad; 4316 u8 extra_bytes = pad + packet->extra_byte + 4317 (SIZE_OF_CRC << 2); 4318 u32 pmtu = qp->pmtu; 4319 4320 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 4321 goto send_nak; 4322 len = req->comp_seg * req->seg_len; 4323 len += delta_psn(psn, 4324 full_flow_psn(flow, flow->flow_state.spsn)) * 4325 pmtu; 4326 if (unlikely(req->total_len - len < pmtu)) 4327 goto send_nak; 4328 4329 /* 4330 * The e->rdma_sge field is set when TID RDMA WRITE REQ 4331 * is first received and is never modified thereafter. 4332 */ 4333 ss.sge = e->rdma_sge; 4334 ss.sg_list = NULL; 4335 ss.num_sge = 1; 4336 ss.total_len = req->total_len; 4337 rvt_skip_sge(&ss, len, false); 4338 rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, 4339 false); 4340 /* Raise the sw sequence check flag for next packet */ 4341 priv->r_next_psn_kdeth = mask_psn(psn + 1); 4342 priv->s_flags |= HFI1_R_TID_SW_PSN; 4343 } 4344 goto exit; 4345 } 4346 flow->flow_state.r_next_psn = mask_psn(psn + 1); 4347 hfi1_kern_exp_rcv_clear(req); 4348 priv->alloc_w_segs--; 4349 rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK; 4350 req->comp_seg++; 4351 priv->s_nak_state = 0; 4352 4353 /* 4354 * Release the flow if one of the following conditions has been met: 4355 * - The request has reached a sync point AND all outstanding 4356 * segments have been completed, or 4357 * - The entire request is complete and there are no more requests 4358 * (of any kind) in the queue. 4359 */ 4360 trace_hfi1_rsp_rcv_tid_write_data(qp, psn); 4361 trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn, 4362 req); 4363 trace_hfi1_tid_write_rsp_rcv_data(qp); 4364 validate_r_tid_ack(priv); 4365 4366 if (opcode == TID_OP(WRITE_DATA_LAST)) { 4367 release_rdma_sge_mr(e); 4368 for (next = priv->r_tid_tail + 1; ; next++) { 4369 if (next > rvt_size_atomic(&dev->rdi)) 4370 next = 0; 4371 if (next == priv->r_tid_head) 4372 break; 4373 e = &qp->s_ack_queue[next]; 4374 if (e->opcode == TID_OP(WRITE_REQ)) 4375 break; 4376 } 4377 priv->r_tid_tail = next; 4378 if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi)) 4379 qp->s_acked_ack_queue = 0; 4380 } 4381 4382 hfi1_tid_write_alloc_resources(qp, true); 4383 4384 /* 4385 * If we need to generate more responses, schedule the 4386 * send engine. 
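 * More responses are needed either for the remaining segments of this
 * request or for other entries still waiting in the ack queue.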
4387 */ 4388 if (req->cur_seg < req->total_segs || 4389 qp->s_tail_ack_queue != qp->r_head_ack_queue) { 4390 qp->s_flags |= RVT_S_RESP_PENDING; 4391 hfi1_schedule_send(qp); 4392 } 4393 4394 priv->pending_tid_w_segs--; 4395 if (priv->s_flags & HFI1_R_TID_RSC_TIMER) { 4396 if (priv->pending_tid_w_segs) 4397 hfi1_mod_tid_reap_timer(req->qp); 4398 else 4399 hfi1_stop_tid_reap_timer(req->qp); 4400 } 4401 4402done: 4403 tid_rdma_schedule_ack(qp); 4404exit: 4405 priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; 4406 if (fecn) 4407 qp->s_flags |= RVT_S_ECN; 4408 spin_unlock_irqrestore(&qp->s_lock, flags); 4409 return; 4410 4411send_nak: 4412 if (!priv->s_nak_state) { 4413 priv->s_nak_state = IB_NAK_PSN_ERROR; 4414 priv->s_nak_psn = flow->flow_state.r_next_psn; 4415 tid_rdma_trigger_ack(qp); 4416 } 4417 goto done; 4418} 4419 4420static bool hfi1_tid_rdma_is_resync_psn(u32 psn) 4421{ 4422 return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) == 4423 HFI1_KDETH_BTH_SEQ_MASK); 4424} 4425 4426u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, 4427 struct ib_other_headers *ohdr, u16 iflow, 4428 u32 *bth1, u32 *bth2) 4429{ 4430 struct hfi1_qp_priv *qpriv = qp->priv; 4431 struct tid_flow_state *fs = &qpriv->flow_state; 4432 struct tid_rdma_request *req = ack_to_tid_req(e); 4433 struct tid_rdma_flow *flow = &req->flows[iflow]; 4434 struct tid_rdma_params *remote; 4435 4436 rcu_read_lock(); 4437 remote = rcu_dereference(qpriv->tid_rdma.remote); 4438 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); 4439 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); 4440 *bth1 = remote->qp; 4441 rcu_read_unlock(); 4442 4443 if (qpriv->resync) { 4444 *bth2 = mask_psn((fs->generation << 4445 HFI1_KDETH_BTH_SEQ_SHIFT) - 1); 4446 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); 4447 } else if (qpriv->s_nak_state) { 4448 *bth2 = mask_psn(qpriv->s_nak_psn); 4449 ohdr->u.tid_rdma.ack.aeth = 4450 cpu_to_be32((qp->r_msn & IB_MSN_MASK) | 4451 (qpriv->s_nak_state << 4452 IB_AETH_CREDIT_SHIFT)); 4453 } else { 4454 *bth2 = full_flow_psn(flow, flow->flow_state.lpsn); 4455 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); 4456 } 4457 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); 4458 ohdr->u.tid_rdma.ack.tid_flow_qp = 4459 cpu_to_be32(qpriv->tid_rdma.local.qp | 4460 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 4461 TID_RDMA_DESTQP_FLOW_SHIFT) | 4462 qpriv->rcd->ctxt); 4463 4464 ohdr->u.tid_rdma.ack.tid_flow_psn = 0; 4465 ohdr->u.tid_rdma.ack.verbs_psn = 4466 cpu_to_be32(flow->flow_state.resp_ib_psn); 4467 4468 if (qpriv->resync) { 4469 /* 4470 * If the PSN before the current expect KDETH PSN is the 4471 * RESYNC PSN, then we never received a good TID RDMA WRITE 4472 * DATA packet after a previous RESYNC. 4473 * In this case, the next expected KDETH PSN stays the same. 4474 */ 4475 if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) { 4476 ohdr->u.tid_rdma.ack.tid_flow_psn = 4477 cpu_to_be32(qpriv->r_next_psn_kdeth_save); 4478 } else { 4479 /* 4480 * Because the KDETH PSNs jump during a RESYNC, it's 4481 * not possible to infer (or compute) the previous value 4482 * of r_next_psn_kdeth in the case of back-to-back 4483 * RESYNC packets. Therefore, we save it. 
4484 */ 4485 qpriv->r_next_psn_kdeth_save = 4486 qpriv->r_next_psn_kdeth - 1; 4487 ohdr->u.tid_rdma.ack.tid_flow_psn = 4488 cpu_to_be32(qpriv->r_next_psn_kdeth_save); 4489 qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1); 4490 } 4491 qpriv->resync = false; 4492 } 4493 4494 return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32); 4495} 4496 4497void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) 4498{ 4499 struct ib_other_headers *ohdr = packet->ohdr; 4500 struct rvt_qp *qp = packet->qp; 4501 struct hfi1_qp_priv *qpriv = qp->priv; 4502 struct rvt_swqe *wqe; 4503 struct tid_rdma_request *req; 4504 struct tid_rdma_flow *flow; 4505 u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn; 4506 unsigned long flags; 4507 u16 fidx; 4508 4509 trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0); 4510 process_ecn(qp, packet); 4511 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4512 aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); 4513 req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn)); 4514 resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); 4515 4516 spin_lock_irqsave(&qp->s_lock, flags); 4517 trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn); 4518 4519 /* If we are waiting for an ACK to RESYNC, drop any other packets */ 4520 if ((qp->s_flags & HFI1_S_WAIT_HALT) && 4521 cmp_psn(psn, qpriv->s_resync_psn)) 4522 goto ack_op_err; 4523 4524 ack_psn = req_psn; 4525 if (hfi1_tid_rdma_is_resync_psn(psn)) 4526 ack_kpsn = resync_psn; 4527 else 4528 ack_kpsn = psn; 4529 if (aeth >> 29) { 4530 ack_psn--; 4531 ack_kpsn--; 4532 } 4533 4534 if (unlikely(qp->s_acked == qp->s_tail)) 4535 goto ack_op_err; 4536 4537 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4538 4539 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 4540 goto ack_op_err; 4541 4542 req = wqe_to_tid_req(wqe); 4543 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4544 wqe->lpsn, req); 4545 flow = &req->flows[req->acked_tail]; 4546 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); 4547 4548 /* Drop stale ACK/NAK */ 4549 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0 || 4550 cmp_psn(req_psn, flow->flow_state.resp_ib_psn) < 0) 4551 goto ack_op_err; 4552 4553 while (cmp_psn(ack_kpsn, 4554 full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 && 4555 req->ack_seg < req->cur_seg) { 4556 req->ack_seg++; 4557 /* advance acked segment pointer */ 4558 req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); 4559 req->r_last_acked = flow->flow_state.resp_ib_psn; 4560 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4561 wqe->lpsn, req); 4562 if (req->ack_seg == req->total_segs) { 4563 req->state = TID_REQUEST_COMPLETE; 4564 wqe = do_rc_completion(qp, wqe, 4565 to_iport(qp->ibqp.device, 4566 qp->port_num)); 4567 trace_hfi1_sender_rcv_tid_ack(qp); 4568 atomic_dec(&qpriv->n_tid_requests); 4569 if (qp->s_acked == qp->s_tail) 4570 break; 4571 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 4572 break; 4573 req = wqe_to_tid_req(wqe); 4574 } 4575 flow = &req->flows[req->acked_tail]; 4576 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); 4577 } 4578 4579 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4580 wqe->lpsn, req); 4581 switch (aeth >> 29) { 4582 case 0: /* ACK */ 4583 if (qpriv->s_flags & RVT_S_WAIT_ACK) 4584 qpriv->s_flags &= ~RVT_S_WAIT_ACK; 4585 if (!hfi1_tid_rdma_is_resync_psn(psn)) { 4586 /* Check if there is any pending TID ACK */ 4587 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 4588 req->ack_seg < req->cur_seg) 4589 hfi1_mod_tid_retry_timer(qp); 4590 else 
4591 hfi1_stop_tid_retry_timer(qp); 4592 hfi1_schedule_send(qp); 4593 } else { 4594 u32 spsn, fpsn, last_acked, generation; 4595 struct tid_rdma_request *rptr; 4596 4597 /* ACK(RESYNC) */ 4598 hfi1_stop_tid_retry_timer(qp); 4599 /* Allow new requests (see hfi1_make_tid_rdma_pkt) */ 4600 qp->s_flags &= ~HFI1_S_WAIT_HALT; 4601 /* 4602 * Clear RVT_S_SEND_ONE flag in case that the TID RDMA 4603 * ACK is received after the TID retry timer is fired 4604 * again. In this case, do not send any more TID 4605 * RESYNC request or wait for any more TID ACK packet. 4606 */ 4607 qpriv->s_flags &= ~RVT_S_SEND_ONE; 4608 hfi1_schedule_send(qp); 4609 4610 if ((qp->s_acked == qpriv->s_tid_tail && 4611 req->ack_seg == req->total_segs) || 4612 qp->s_acked == qp->s_tail) { 4613 qpriv->s_state = TID_OP(WRITE_DATA_LAST); 4614 goto done; 4615 } 4616 4617 if (req->ack_seg == req->comp_seg) { 4618 qpriv->s_state = TID_OP(WRITE_DATA); 4619 goto done; 4620 } 4621 4622 /* 4623 * The PSN to start with is the next PSN after the 4624 * RESYNC PSN. 4625 */ 4626 psn = mask_psn(psn + 1); 4627 generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 4628 spsn = 0; 4629 4630 /* 4631 * Update to the correct WQE when we get an ACK(RESYNC) 4632 * in the middle of a request. 4633 */ 4634 if (delta_psn(ack_psn, wqe->lpsn)) 4635 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4636 req = wqe_to_tid_req(wqe); 4637 flow = &req->flows[req->acked_tail]; 4638 /* 4639 * RESYNC re-numbers the PSN ranges of all remaining 4640 * segments. Also, PSN's start from 0 in the middle of a 4641 * segment and the first segment size is less than the 4642 * default number of packets. flow->resync_npkts is used 4643 * to track the number of packets from the start of the 4644 * real segment to the point of 0 PSN after the RESYNC 4645 * in order to later correctly rewind the SGE. 4646 */ 4647 fpsn = full_flow_psn(flow, flow->flow_state.spsn); 4648 req->r_ack_psn = psn; 4649 /* 4650 * If resync_psn points to the last flow PSN for a 4651 * segment and the new segment (likely from a new 4652 * request) starts with a new generation number, we 4653 * need to adjust resync_psn accordingly. 4654 */ 4655 if (flow->flow_state.generation != 4656 (resync_psn >> HFI1_KDETH_BTH_SEQ_SHIFT)) 4657 resync_psn = mask_psn(fpsn - 1); 4658 flow->resync_npkts += 4659 delta_psn(mask_psn(resync_psn + 1), fpsn); 4660 /* 4661 * Renumber all packet sequence number ranges 4662 * based on the new generation. 
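 * The nested loops below walk every un-ACKed flow of each outstanding
 * request, starting from its acked_tail, and restamp spsn/lpsn under the
 * new generation.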
4663 */ 4664 last_acked = qp->s_acked; 4665 rptr = req; 4666 while (1) { 4667 /* start from last acked segment */ 4668 for (fidx = rptr->acked_tail; 4669 CIRC_CNT(rptr->setup_head, fidx, 4670 MAX_FLOWS); 4671 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { 4672 u32 lpsn; 4673 u32 gen; 4674 4675 flow = &rptr->flows[fidx]; 4676 gen = flow->flow_state.generation; 4677 if (WARN_ON(gen == generation && 4678 flow->flow_state.spsn != 4679 spsn)) 4680 continue; 4681 lpsn = flow->flow_state.lpsn; 4682 lpsn = full_flow_psn(flow, lpsn); 4683 flow->npkts = 4684 delta_psn(lpsn, 4685 mask_psn(resync_psn) 4686 ); 4687 flow->flow_state.generation = 4688 generation; 4689 flow->flow_state.spsn = spsn; 4690 flow->flow_state.lpsn = 4691 flow->flow_state.spsn + 4692 flow->npkts - 1; 4693 flow->pkt = 0; 4694 spsn += flow->npkts; 4695 resync_psn += flow->npkts; 4696 trace_hfi1_tid_flow_rcv_tid_ack(qp, 4697 fidx, 4698 flow); 4699 } 4700 if (++last_acked == qpriv->s_tid_cur + 1) 4701 break; 4702 if (last_acked == qp->s_size) 4703 last_acked = 0; 4704 wqe = rvt_get_swqe_ptr(qp, last_acked); 4705 rptr = wqe_to_tid_req(wqe); 4706 } 4707 req->cur_seg = req->ack_seg; 4708 qpriv->s_tid_tail = qp->s_acked; 4709 qpriv->s_state = TID_OP(WRITE_REQ); 4710 hfi1_schedule_tid_send(qp); 4711 } 4712done: 4713 qpriv->s_retry = qp->s_retry_cnt; 4714 break; 4715 4716 case 3: /* NAK */ 4717 hfi1_stop_tid_retry_timer(qp); 4718 switch ((aeth >> IB_AETH_CREDIT_SHIFT) & 4719 IB_AETH_CREDIT_MASK) { 4720 case 0: /* PSN sequence error */ 4721 if (!req->flows) 4722 break; 4723 flow = &req->flows[req->acked_tail]; 4724 flpsn = full_flow_psn(flow, flow->flow_state.lpsn); 4725 if (cmp_psn(psn, flpsn) > 0) 4726 break; 4727 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, 4728 flow); 4729 req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4730 req->cur_seg = req->ack_seg; 4731 qpriv->s_tid_tail = qp->s_acked; 4732 qpriv->s_state = TID_OP(WRITE_REQ); 4733 qpriv->s_retry = qp->s_retry_cnt; 4734 hfi1_schedule_tid_send(qp); 4735 break; 4736 4737 default: 4738 break; 4739 } 4740 break; 4741 4742 default: 4743 break; 4744 } 4745 4746ack_op_err: 4747 spin_unlock_irqrestore(&qp->s_lock, flags); 4748} 4749 4750void hfi1_add_tid_retry_timer(struct rvt_qp *qp) 4751{ 4752 struct hfi1_qp_priv *priv = qp->priv; 4753 struct ib_qp *ibqp = &qp->ibqp; 4754 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); 4755 4756 lockdep_assert_held(&qp->s_lock); 4757 if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) { 4758 priv->s_flags |= HFI1_S_TID_RETRY_TIMER; 4759 priv->s_tid_retry_timer.expires = jiffies + 4760 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies; 4761 add_timer(&priv->s_tid_retry_timer); 4762 } 4763} 4764 4765static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp) 4766{ 4767 struct hfi1_qp_priv *priv = qp->priv; 4768 struct ib_qp *ibqp = &qp->ibqp; 4769 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); 4770 4771 lockdep_assert_held(&qp->s_lock); 4772 priv->s_flags |= HFI1_S_TID_RETRY_TIMER; 4773 mod_timer(&priv->s_tid_retry_timer, jiffies + 4774 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies); 4775} 4776 4777static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp) 4778{ 4779 struct hfi1_qp_priv *priv = qp->priv; 4780 int rval = 0; 4781 4782 lockdep_assert_held(&qp->s_lock); 4783 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { 4784 rval = del_timer(&priv->s_tid_retry_timer); 4785 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; 4786 } 4787 return rval; 4788} 4789 4790void hfi1_del_tid_retry_timer(struct rvt_qp *qp) 4791{ 4792 struct hfi1_qp_priv *priv = qp->priv; 
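	/*
	 * Unlike the del_timer() in hfi1_stop_tid_retry_timer(), this waits
	 * for a concurrently running handler to finish before the flag is
	 * cleared.
	 */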
4793 4794 del_timer_sync(&priv->s_tid_retry_timer); 4795 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; 4796} 4797 4798static void hfi1_tid_retry_timeout(struct timer_list *t) 4799{ 4800 struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer); 4801 struct rvt_qp *qp = priv->owner; 4802 struct rvt_swqe *wqe; 4803 unsigned long flags; 4804 struct tid_rdma_request *req; 4805 4806 spin_lock_irqsave(&qp->r_lock, flags); 4807 spin_lock(&qp->s_lock); 4808 trace_hfi1_tid_write_sender_retry_timeout(qp, 0); 4809 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { 4810 hfi1_stop_tid_retry_timer(qp); 4811 if (!priv->s_retry) { 4812 trace_hfi1_msg_tid_retry_timeout(/* msg */ 4813 qp, 4814 "Exhausted retries. Tid retry timeout = ", 4815 (u64)priv->tid_retry_timeout_jiffies); 4816 4817 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4818 hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); 4819 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 4820 } else { 4821 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4822 req = wqe_to_tid_req(wqe); 4823 trace_hfi1_tid_req_tid_retry_timeout(/* req */ 4824 qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); 4825 4826 priv->s_flags &= ~RVT_S_WAIT_ACK; 4827 /* Only send one packet (the RESYNC) */ 4828 priv->s_flags |= RVT_S_SEND_ONE; 4829 /* 4830 * No additional request shall be made by this QP until 4831 * the RESYNC has been complete. 4832 */ 4833 qp->s_flags |= HFI1_S_WAIT_HALT; 4834 priv->s_state = TID_OP(RESYNC); 4835 priv->s_retry--; 4836 hfi1_schedule_tid_send(qp); 4837 } 4838 } 4839 spin_unlock(&qp->s_lock); 4840 spin_unlock_irqrestore(&qp->r_lock, flags); 4841} 4842 4843u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, 4844 struct ib_other_headers *ohdr, u32 *bth1, 4845 u32 *bth2, u16 fidx) 4846{ 4847 struct hfi1_qp_priv *qpriv = qp->priv; 4848 struct tid_rdma_params *remote; 4849 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 4850 struct tid_rdma_flow *flow = &req->flows[fidx]; 4851 u32 generation; 4852 4853 rcu_read_lock(); 4854 remote = rcu_dereference(qpriv->tid_rdma.remote); 4855 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); 4856 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); 4857 *bth1 = remote->qp; 4858 rcu_read_unlock(); 4859 4860 generation = kern_flow_generation_next(flow->flow_state.generation); 4861 *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1); 4862 qpriv->s_resync_psn = *bth2; 4863 *bth2 |= IB_BTH_REQ_ACK; 4864 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); 4865 4866 return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32); 4867} 4868 4869void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) 4870{ 4871 struct ib_other_headers *ohdr = packet->ohdr; 4872 struct rvt_qp *qp = packet->qp; 4873 struct hfi1_qp_priv *qpriv = qp->priv; 4874 struct hfi1_ctxtdata *rcd = qpriv->rcd; 4875 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 4876 struct rvt_ack_entry *e; 4877 struct tid_rdma_request *req; 4878 struct tid_rdma_flow *flow; 4879 struct tid_flow_state *fs = &qpriv->flow_state; 4880 u32 psn, generation, idx, gen_next; 4881 bool fecn; 4882 unsigned long flags; 4883 4884 fecn = process_ecn(qp, packet); 4885 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4886 4887 generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT; 4888 spin_lock_irqsave(&qp->s_lock, flags); 4889 4890 gen_next = (fs->generation == KERN_GENERATION_RESERVED) ? 
4891 generation : kern_flow_generation_next(fs->generation); 4892 /* 4893 * RESYNC packet contains the "next" generation and can only be 4894 * from the current or previous generations 4895 */ 4896 if (generation != mask_generation(gen_next - 1) && 4897 generation != gen_next) 4898 goto bail; 4899 /* Already processing a resync */ 4900 if (qpriv->resync) 4901 goto bail; 4902 4903 spin_lock(&rcd->exp_lock); 4904 if (fs->index >= RXE_NUM_TID_FLOWS) { 4905 /* 4906 * If we don't have a flow, save the generation so it can be 4907 * applied when a new flow is allocated 4908 */ 4909 fs->generation = generation; 4910 } else { 4911 /* Reprogram the QP flow with new generation */ 4912 rcd->flows[fs->index].generation = generation; 4913 fs->generation = kern_setup_hw_flow(rcd, fs->index); 4914 } 4915 fs->psn = 0; 4916 /* 4917 * Disable SW PSN checking since a RESYNC is equivalent to a 4918 * sync point and the flow has/will be reprogrammed 4919 */ 4920 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 4921 trace_hfi1_tid_write_rsp_rcv_resync(qp); 4922 4923 /* 4924 * Reset all TID flow information with the new generation. 4925 * This is done for all requests and segments after the 4926 * last received segment 4927 */ 4928 for (idx = qpriv->r_tid_tail; ; idx++) { 4929 u16 flow_idx; 4930 4931 if (idx > rvt_size_atomic(&dev->rdi)) 4932 idx = 0; 4933 e = &qp->s_ack_queue[idx]; 4934 if (e->opcode == TID_OP(WRITE_REQ)) { 4935 req = ack_to_tid_req(e); 4936 trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn, 4937 e->lpsn, req); 4938 4939 /* start from last unacked segment */ 4940 for (flow_idx = req->clear_tail; 4941 CIRC_CNT(req->setup_head, flow_idx, 4942 MAX_FLOWS); 4943 flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) { 4944 u32 lpsn; 4945 u32 next; 4946 4947 flow = &req->flows[flow_idx]; 4948 lpsn = full_flow_psn(flow, 4949 flow->flow_state.lpsn); 4950 next = flow->flow_state.r_next_psn; 4951 flow->npkts = delta_psn(lpsn, next - 1); 4952 flow->flow_state.generation = fs->generation; 4953 flow->flow_state.spsn = fs->psn; 4954 flow->flow_state.lpsn = 4955 flow->flow_state.spsn + flow->npkts - 1; 4956 flow->flow_state.r_next_psn = 4957 full_flow_psn(flow, 4958 flow->flow_state.spsn); 4959 fs->psn += flow->npkts; 4960 trace_hfi1_tid_flow_rcv_resync(qp, flow_idx, 4961 flow); 4962 } 4963 } 4964 if (idx == qp->s_tail_ack_queue) 4965 break; 4966 } 4967 4968 spin_unlock(&rcd->exp_lock); 4969 qpriv->resync = true; 4970 /* RESYNC request always gets a TID RDMA ACK. */ 4971 qpriv->s_nak_state = 0; 4972 tid_rdma_trigger_ack(qp); 4973bail: 4974 if (fecn) 4975 qp->s_flags |= RVT_S_ECN; 4976 spin_unlock_irqrestore(&qp->s_lock, flags); 4977} 4978 4979/* 4980 * Call this function when the last TID RDMA WRITE DATA packet for a request 4981 * is built. 
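 * It advances s_tid_tail to the next TID RDMA WRITE WQE (never past
 * s_tid_cur) and resets s_state to TID_OP(WRITE_RESP).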
4982 */ 4983static void update_tid_tail(struct rvt_qp *qp) 4984 __must_hold(&qp->s_lock) 4985{ 4986 struct hfi1_qp_priv *priv = qp->priv; 4987 u32 i; 4988 struct rvt_swqe *wqe; 4989 4990 lockdep_assert_held(&qp->s_lock); 4991 /* Can't move beyond s_tid_cur */ 4992 if (priv->s_tid_tail == priv->s_tid_cur) 4993 return; 4994 for (i = priv->s_tid_tail + 1; ; i++) { 4995 if (i == qp->s_size) 4996 i = 0; 4997 4998 if (i == priv->s_tid_cur) 4999 break; 5000 wqe = rvt_get_swqe_ptr(qp, i); 5001 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 5002 break; 5003 } 5004 priv->s_tid_tail = i; 5005 priv->s_state = TID_OP(WRITE_RESP); 5006} 5007 5008int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) 5009 __must_hold(&qp->s_lock) 5010{ 5011 struct hfi1_qp_priv *priv = qp->priv; 5012 struct rvt_swqe *wqe; 5013 u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0; 5014 struct ib_other_headers *ohdr; 5015 struct rvt_sge_state *ss = &qp->s_sge; 5016 struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 5017 struct tid_rdma_request *req = ack_to_tid_req(e); 5018 bool last = false; 5019 u8 opcode = TID_OP(WRITE_DATA); 5020 5021 lockdep_assert_held(&qp->s_lock); 5022 trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); 5023 /* 5024 * Prioritize the sending of the requests and responses over the 5025 * sending of the TID RDMA data packets. 5026 */ 5027 if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) && 5028 atomic_read(&priv->n_requests) && 5029 !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK | 5030 HFI1_S_ANY_WAIT_IO))) || 5031 (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg && 5032 !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) { 5033 struct iowait_work *iowork; 5034 5035 iowork = iowait_get_ib_work(&priv->s_iowait); 5036 ps->s_txreq = get_waiting_verbs_txreq(iowork); 5037 if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) { 5038 priv->s_flags |= HFI1_S_TID_BUSY_SET; 5039 return 1; 5040 } 5041 } 5042 5043 ps->s_txreq = get_txreq(ps->dev, qp); 5044 if (!ps->s_txreq) 5045 goto bail_no_tx; 5046 5047 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; 5048 5049 if ((priv->s_flags & RVT_S_ACK_PENDING) && 5050 make_tid_rdma_ack(qp, ohdr, ps)) 5051 return 1; 5052 5053 /* 5054 * Bail out if we can't send data. 5055 * Be reminded that this check must been done after the call to 5056 * make_tid_rdma_ack() because the responding QP could be in 5057 * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA. 5058 */ 5059 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) 5060 goto bail; 5061 5062 if (priv->s_flags & RVT_S_WAIT_ACK) 5063 goto bail; 5064 5065 /* Check whether there is anything to do. */ 5066 if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) 5067 goto bail; 5068 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); 5069 req = wqe_to_tid_req(wqe); 5070 trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn, 5071 wqe->lpsn, req); 5072 switch (priv->s_state) { 5073 case TID_OP(WRITE_REQ): 5074 case TID_OP(WRITE_RESP): 5075 priv->tid_ss.sge = wqe->sg_list[0]; 5076 priv->tid_ss.sg_list = wqe->sg_list + 1; 5077 priv->tid_ss.num_sge = wqe->wr.num_sge; 5078 priv->tid_ss.total_len = wqe->length; 5079 5080 if (priv->s_state == TID_OP(WRITE_REQ)) 5081 hfi1_tid_rdma_restart_req(qp, wqe, &bth2); 5082 priv->s_state = TID_OP(WRITE_DATA); 5083 fallthrough; 5084 5085 case TID_OP(WRITE_DATA): 5086 /* 5087 * 1. Check whether TID RDMA WRITE RESP available. 5088 * 2. 
If no: 5089 * 2.1 If have more segments and no TID RDMA WRITE RESP, 5090 * set HFI1_S_WAIT_TID_RESP 5091 * 2.2 Return indicating no progress made. 5092 * 3. If yes: 5093 * 3.1 Build TID RDMA WRITE DATA packet. 5094 * 3.2 If last packet in segment: 5095 * 3.2.1 Change KDETH header bits 5096 * 3.2.2 Advance RESP pointers. 5097 * 3.3 Return indicating progress made. 5098 */ 5099 trace_hfi1_sender_make_tid_pkt(qp); 5100 trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); 5101 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); 5102 req = wqe_to_tid_req(wqe); 5103 len = wqe->length; 5104 5105 if (!req->comp_seg || req->cur_seg == req->comp_seg) 5106 goto bail; 5107 5108 trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, 5109 wqe->psn, wqe->lpsn, req); 5110 last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, 5111 &len); 5112 5113 if (last) { 5114 /* move pointer to next flow */ 5115 req->clear_tail = CIRC_NEXT(req->clear_tail, 5116 MAX_FLOWS); 5117 if (++req->cur_seg < req->total_segs) { 5118 if (!CIRC_CNT(req->setup_head, req->clear_tail, 5119 MAX_FLOWS)) 5120 qp->s_flags |= HFI1_S_WAIT_TID_RESP; 5121 } else { 5122 priv->s_state = TID_OP(WRITE_DATA_LAST); 5123 opcode = TID_OP(WRITE_DATA_LAST); 5124 5125 /* Advance the s_tid_tail now */ 5126 update_tid_tail(qp); 5127 } 5128 } 5129 hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32); 5130 ss = &priv->tid_ss; 5131 break; 5132 5133 case TID_OP(RESYNC): 5134 trace_hfi1_sender_make_tid_pkt(qp); 5135 /* Use generation from the most recently received response */ 5136 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); 5137 req = wqe_to_tid_req(wqe); 5138 /* If no responses for this WQE look at the previous one */ 5139 if (!req->comp_seg) { 5140 wqe = rvt_get_swqe_ptr(qp, 5141 (!priv->s_tid_cur ? qp->s_size : 5142 priv->s_tid_cur) - 1); 5143 req = wqe_to_tid_req(wqe); 5144 } 5145 hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1, 5146 &bth2, 5147 CIRC_PREV(req->setup_head, 5148 MAX_FLOWS)); 5149 ss = NULL; 5150 len = 0; 5151 opcode = TID_OP(RESYNC); 5152 break; 5153 5154 default: 5155 goto bail; 5156 } 5157 if (priv->s_flags & RVT_S_SEND_ONE) { 5158 priv->s_flags &= ~RVT_S_SEND_ONE; 5159 priv->s_flags |= RVT_S_WAIT_ACK; 5160 bth2 |= IB_BTH_REQ_ACK; 5161 } 5162 qp->s_len -= len; 5163 ps->s_txreq->hdr_dwords = hwords; 5164 ps->s_txreq->sde = priv->s_sde; 5165 ps->s_txreq->ss = ss; 5166 ps->s_txreq->s_cur_size = len; 5167 hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2, 5168 middle, ps); 5169 return 1; 5170bail: 5171 hfi1_put_txreq(ps->s_txreq); 5172bail_no_tx: 5173 ps->s_txreq = NULL; 5174 priv->s_flags &= ~RVT_S_BUSY; 5175 /* 5176 * If we didn't get a txreq, the QP will be woken up later to try 5177 * again, set the flags to the wake up which work item to wake 5178 * up. 5179 * (A better algorithm should be found to do this and generalize the 5180 * sleep/wakeup flags.) 5181 */ 5182 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); 5183 return 0; 5184} 5185 5186static int make_tid_rdma_ack(struct rvt_qp *qp, 5187 struct ib_other_headers *ohdr, 5188 struct hfi1_pkt_state *ps) 5189{ 5190 struct rvt_ack_entry *e; 5191 struct hfi1_qp_priv *qpriv = qp->priv; 5192 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 5193 u32 hwords, next; 5194 u32 len = 0; 5195 u32 bth1 = 0, bth2 = 0; 5196 int middle = 0; 5197 u16 flow; 5198 struct tid_rdma_request *req, *nreq; 5199 5200 trace_hfi1_tid_write_rsp_make_tid_ack(qp); 5201 /* Don't send an ACK if we aren't supposed to. 
*/ 5202 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) 5203 goto bail; 5204 5205 /* header size in 32-bit words LRH+BTH = (8+12)/4. */ 5206 hwords = 5; 5207 5208 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5209 req = ack_to_tid_req(e); 5210 /* 5211 * In the RESYNC case, we are exactly one segment past the 5212 * previously sent ack or at the previously sent NAK. So to send 5213 * the resync ack, we go back one segment (which might be part of 5214 * the previous request) and let the do-while loop execute again. 5215 * The advantage of executing the do-while loop is that any data 5216 * received after the previous ack is automatically acked in the 5217 * RESYNC ack. It turns out that for the do-while loop we only need 5218 * to pull back qpriv->r_tid_ack, not the segment 5219 * indices/counters. The scheme works even if the previous request 5220 * was not a TID WRITE request. 5221 */ 5222 if (qpriv->resync) { 5223 if (!req->ack_seg || req->ack_seg == req->total_segs) 5224 qpriv->r_tid_ack = !qpriv->r_tid_ack ? 5225 rvt_size_atomic(&dev->rdi) : 5226 qpriv->r_tid_ack - 1; 5227 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5228 req = ack_to_tid_req(e); 5229 } 5230 5231 trace_hfi1_rsp_make_tid_ack(qp, e->psn); 5232 trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, 5233 req); 5234 /* 5235 * If we've sent all the ACKs that we can, we are done 5236 * until we get more segments... 5237 */ 5238 if (!qpriv->s_nak_state && !qpriv->resync && 5239 req->ack_seg == req->comp_seg) 5240 goto bail; 5241 5242 do { 5243 /* 5244 * To deal with coalesced ACKs, the acked_tail pointer 5245 * into the flow array is used. The distance between it 5246 * and the clear_tail is the number of flows that are 5247 * being ACK'ed. 5248 */ 5249 req->ack_seg += 5250 /* Get up-to-date value */ 5251 CIRC_CNT(req->clear_tail, req->acked_tail, 5252 MAX_FLOWS); 5253 /* Advance acked index */ 5254 req->acked_tail = req->clear_tail; 5255 5256 /* 5257 * req->clear_tail points to the segment currently being 5258 * received. So, when sending an ACK, the previous 5259 * segment is being ACK'ed. 5260 */ 5261 flow = CIRC_PREV(req->acked_tail, MAX_FLOWS); 5262 if (req->ack_seg != req->total_segs) 5263 break; 5264 req->state = TID_REQUEST_COMPLETE; 5265 5266 next = qpriv->r_tid_ack + 1; 5267 if (next > rvt_size_atomic(&dev->rdi)) 5268 next = 0; 5269 qpriv->r_tid_ack = next; 5270 if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ)) 5271 break; 5272 nreq = ack_to_tid_req(&qp->s_ack_queue[next]); 5273 if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg) 5274 break; 5275 5276 /* Move to the next ack entry now */ 5277 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5278 req = ack_to_tid_req(e); 5279 } while (1); 5280 5281 /* 5282 * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and 5283 * req could be pointing at the previous ack queue entry 5284 */ 5285 if (qpriv->s_nak_state || 5286 (qpriv->resync && 5287 !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) && 5288 (cmp_psn(qpriv->r_next_psn_kdeth - 1, 5289 full_flow_psn(&req->flows[flow], 5290 req->flows[flow].flow_state.lpsn)) > 0))) { 5291 /* 5292 * A NAK will implicitly acknowledge all previous TID RDMA 5293 * requests. 
Therefore, we NAK with the req->acked_tail 5294 * segment for the request at qpriv->r_tid_ack (same at 5295 * this point as the req->clear_tail segment for the 5296 * qpriv->r_tid_tail request) 5297 */ 5298 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5299 req = ack_to_tid_req(e); 5300 flow = req->acked_tail; 5301 } else if (req->ack_seg == req->total_segs && 5302 qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) 5303 qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; 5304 5305 trace_hfi1_tid_write_rsp_make_tid_ack(qp); 5306 trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, 5307 req); 5308 hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, 5309 &bth2); 5310 len = 0; 5311 qpriv->s_flags &= ~RVT_S_ACK_PENDING; 5312 ps->s_txreq->hdr_dwords = hwords; 5313 ps->s_txreq->sde = qpriv->s_sde; 5314 ps->s_txreq->s_cur_size = len; 5315 ps->s_txreq->ss = NULL; 5316 hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, 5317 ps); 5318 ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; 5319 return 1; 5320bail: 5321 /* 5322 * Ensure s_rdma_ack_cnt changes are committed prior to resetting 5323 * RVT_S_RESP_PENDING 5324 */ 5325 smp_wmb(); 5326 qpriv->s_flags &= ~RVT_S_ACK_PENDING; 5327 return 0; 5328} 5329 5330static int hfi1_send_tid_ok(struct rvt_qp *qp) 5331{ 5332 struct hfi1_qp_priv *priv = qp->priv; 5333 5334 return !(priv->s_flags & RVT_S_BUSY || 5335 qp->s_flags & HFI1_S_ANY_WAIT_IO) && 5336 (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) || 5337 (priv->s_flags & RVT_S_RESP_PENDING) || 5338 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND)); 5339} 5340 5341void _hfi1_do_tid_send(struct work_struct *work) 5342{ 5343 struct iowait_work *w = container_of(work, struct iowait_work, iowork); 5344 struct rvt_qp *qp = iowait_to_qp(w->iow); 5345 5346 hfi1_do_tid_send(qp); 5347} 5348 5349static void hfi1_do_tid_send(struct rvt_qp *qp) 5350{ 5351 struct hfi1_pkt_state ps; 5352 struct hfi1_qp_priv *priv = qp->priv; 5353 5354 ps.dev = to_idev(qp->ibqp.device); 5355 ps.ibp = to_iport(qp->ibqp.device, qp->port_num); 5356 ps.ppd = ppd_from_ibp(ps.ibp); 5357 ps.wait = iowait_get_tid_work(&priv->s_iowait); 5358 ps.in_thread = false; 5359 ps.timeout_int = qp->timeout_jiffies / 8; 5360 5361 trace_hfi1_rc_do_tid_send(qp, false); 5362 spin_lock_irqsave(&qp->s_lock, ps.flags); 5363 5364 /* Return if we are already busy processing a work request. */ 5365 if (!hfi1_send_tid_ok(qp)) { 5366 if (qp->s_flags & HFI1_S_ANY_WAIT_IO) 5367 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); 5368 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5369 return; 5370 } 5371 5372 priv->s_flags |= RVT_S_BUSY; 5373 5374 ps.timeout = jiffies + ps.timeout_int; 5375 ps.cpu = priv->s_sde ? priv->s_sde->cpu : 5376 cpumask_first(cpumask_of_node(ps.ppd->dd->node)); 5377 ps.pkts_sent = false; 5378 5379 /* insure a pre-built packet is handled */ 5380 ps.s_txreq = get_waiting_verbs_txreq(ps.wait); 5381 do { 5382 /* Check for a constructed packet to be sent. */ 5383 if (ps.s_txreq) { 5384 if (priv->s_flags & HFI1_S_TID_BUSY_SET) { 5385 qp->s_flags |= RVT_S_BUSY; 5386 ps.wait = iowait_get_ib_work(&priv->s_iowait); 5387 } 5388 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5389 5390 /* 5391 * If the packet cannot be sent now, return and 5392 * the send tasklet will be woken up later. 
5393 */ 5394 if (hfi1_verbs_send(qp, &ps)) 5395 return; 5396 5397 /* allow other tasks to run */ 5398 if (hfi1_schedule_send_yield(qp, &ps, true)) 5399 return; 5400 5401 spin_lock_irqsave(&qp->s_lock, ps.flags); 5402 if (priv->s_flags & HFI1_S_TID_BUSY_SET) { 5403 qp->s_flags &= ~RVT_S_BUSY; 5404 priv->s_flags &= ~HFI1_S_TID_BUSY_SET; 5405 ps.wait = iowait_get_tid_work(&priv->s_iowait); 5406 if (iowait_flag_set(&priv->s_iowait, 5407 IOWAIT_PENDING_IB)) 5408 hfi1_schedule_send(qp); 5409 } 5410 } 5411 } while (hfi1_make_tid_rdma_pkt(qp, &ps)); 5412 iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); 5413 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5414} 5415 5416static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) 5417{ 5418 struct hfi1_qp_priv *priv = qp->priv; 5419 struct hfi1_ibport *ibp = 5420 to_iport(qp->ibqp.device, qp->port_num); 5421 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 5422 struct hfi1_devdata *dd = ppd->dd; 5423 5424 if ((dd->flags & HFI1_SHUTDOWN)) 5425 return true; 5426 5427 return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq, 5428 priv->s_sde ? 5429 priv->s_sde->cpu : 5430 cpumask_first(cpumask_of_node(dd->node))); 5431} 5432 5433/** 5434 * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine 5435 * @qp: the QP 5436 * 5437 * This schedules qp progress on the TID RDMA state machine. Caller 5438 * should hold the s_lock. 5439 * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because 5440 * the two state machines can step on each other with respect to the 5441 * RVT_S_BUSY flag. 5442 * Therefore, a modified test is used. 5443 * @return true if the second leg is scheduled; 5444 * false if the second leg is not scheduled. 5445 */ 5446bool hfi1_schedule_tid_send(struct rvt_qp *qp) 5447{ 5448 lockdep_assert_held(&qp->s_lock); 5449 if (hfi1_send_tid_ok(qp)) { 5450 /* 5451 * The following call returns true if the qp is not on the 5452 * queue and false if the qp is already on the queue before 5453 * this call. Either way, the qp will be on the queue when the 5454 * call returns. 5455 */ 5456 _hfi1_schedule_tid_send(qp); 5457 return true; 5458 } 5459 if (qp->s_flags & HFI1_S_ANY_WAIT_IO) 5460 iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, 5461 IOWAIT_PENDING_TID); 5462 return false; 5463} 5464 5465bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) 5466{ 5467 struct rvt_ack_entry *prev; 5468 struct tid_rdma_request *req; 5469 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 5470 struct hfi1_qp_priv *priv = qp->priv; 5471 u32 s_prev; 5472 5473 s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) : 5474 (qp->s_tail_ack_queue - 1); 5475 prev = &qp->s_ack_queue[s_prev]; 5476 5477 if ((e->opcode == TID_OP(READ_REQ) || 5478 e->opcode == OP(RDMA_READ_REQUEST)) && 5479 prev->opcode == TID_OP(WRITE_REQ)) { 5480 req = ack_to_tid_req(prev); 5481 if (req->ack_seg != req->total_segs) { 5482 priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK; 5483 return true; 5484 } 5485 } 5486 return false; 5487} 5488 5489static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx) 5490{ 5491 u64 reg; 5492 5493 /* 5494 * The only sane way to get the amount of 5495 * progress is to read the HW flow state. 
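 * Each flow has one 64-bit entry at RCV_TID_FLOW_TABLE + 8 * fidx;
 * mask_psn() extracts the PSN bits from the register value.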
5496 */ 5497 reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx)); 5498 return mask_psn(reg); 5499} 5500 5501static void tid_rdma_rcv_err(struct hfi1_packet *packet, 5502 struct ib_other_headers *ohdr, 5503 struct rvt_qp *qp, u32 psn, int diff, bool fecn) 5504{ 5505 unsigned long flags; 5506 5507 tid_rdma_rcv_error(packet, ohdr, qp, psn, diff); 5508 if (fecn) { 5509 spin_lock_irqsave(&qp->s_lock, flags); 5510 qp->s_flags |= RVT_S_ECN; 5511 spin_unlock_irqrestore(&qp->s_lock, flags); 5512 } 5513} 5514 5515static void update_r_next_psn_fecn(struct hfi1_packet *packet, 5516 struct hfi1_qp_priv *priv, 5517 struct hfi1_ctxtdata *rcd, 5518 struct tid_rdma_flow *flow, 5519 bool fecn) 5520{ 5521 /* 5522 * If a start/middle packet is delivered here due to 5523 * RSM rule and FECN, we need to update the r_next_psn. 5524 */ 5525 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER && 5526 !(priv->s_flags & HFI1_R_TID_SW_PSN)) { 5527 struct hfi1_devdata *dd = rcd->dd; 5528 5529 flow->flow_state.r_next_psn = 5530 read_r_next_psn(dd, rcd->ctxt, flow->idx); 5531 } 5532}
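/*
 * Note on the KDETH flow PSN layout used throughout the TID RDMA WRITE
 * path above:
 *
 *	psn = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
 *		       (seq & HFI1_KDETH_BTH_SEQ_MASK))
 *
 * full_flow_psn() produces PSNs of this form from a flow's generation and
 * sequence numbers, and hfi1_tid_rdma_is_resync_psn() relies on the
 * all-ones sequence value within a generation being reserved for RESYNC.
 */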