rdma_backend.c
/*
 * QEMU paravirtual RDMA - Generic RDMA backend
 *
 * Copyright (C) 2018 Oracle
 * Copyright (C) 2018 Red Hat Inc
 *
 * Authors:
 *     Yuval Shaia <yuval.shaia@oracle.com>
 *     Marcel Apfelbaum <marcel@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qapi/qapi-events-rdma.h"

#include <infiniband/verbs.h>

#include "contrib/rdmacm-mux/rdmacm-mux.h"
#include "trace.h"
#include "rdma_utils.h"
#include "rdma_rm.h"
#include "rdma_backend.h"

#define THR_NAME_LEN 16
#define THR_POLL_TO 5000

#define MAD_HDR_SIZE sizeof(struct ibv_grh)

typedef struct BackendCtx {
    void *up_ctx;
    struct ibv_sge sge; /* Used to save MAD recv buffer */
    RdmaBackendQP *backend_qp; /* To maintain recv buffers */
    RdmaBackendSRQ *backend_srq;
} BackendCtx;

struct backend_umad {
    struct ib_user_mad hdr;
    char mad[RDMA_MAX_PRIVATE_DATA];
};

static void (*comp_handler)(void *ctx, struct ibv_wc *wc);

static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
{
    rdma_error_report("No completion handler is registered");
}

static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err,
                                 void *ctx)
{
    struct ibv_wc wc = {};

    wc.status = status;
    wc.vendor_err = vendor_err;

    comp_handler(ctx, &wc);
}

static void free_cqe_ctx(gpointer data, gpointer user_data)
{
    BackendCtx *bctx;
    RdmaDeviceResources *rdma_dev_res = user_data;
    unsigned long cqe_ctx_id = GPOINTER_TO_INT(data);

    bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, cqe_ctx_id);
    if (bctx) {
        rdma_rm_dealloc_cqe_ctx(rdma_dev_res, cqe_ctx_id);
        qatomic_dec(&rdma_dev_res->stats.missing_cqe);
    }
    g_free(bctx);
}

static void clean_recv_mads(RdmaBackendDev *backend_dev)
{
    unsigned long cqe_ctx_id;

    do {
        cqe_ctx_id = rdma_protected_gqueue_pop_int64(&backend_dev->
                                                     recv_mads_list);
        if (cqe_ctx_id != -ENOENT) {
            qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
            free_cqe_ctx(GINT_TO_POINTER(cqe_ctx_id),
                         backend_dev->rdma_dev_res);
        }
    } while (cqe_ctx_id != -ENOENT);
}

static int rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
{
    int i, ne, total_ne = 0;
    BackendCtx *bctx;
    struct ibv_wc wc[2];
    RdmaProtectedGSList *cqe_ctx_list;

    WITH_QEMU_LOCK_GUARD(&rdma_dev_res->lock) {
        do {
            ne = ibv_poll_cq(ibcq, ARRAY_SIZE(wc), wc);

            trace_rdma_poll_cq(ne, ibcq);

            for (i = 0; i < ne; i++) {
                bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
                if (unlikely(!bctx)) {
                    rdma_error_report("No matching ctx for req %"PRId64,
                                      wc[i].wr_id);
                    continue;
                }

                comp_handler(bctx->up_ctx, &wc[i]);

                if (bctx->backend_qp) {
                    cqe_ctx_list = &bctx->backend_qp->cqe_ctx_list;
                } else {
                    cqe_ctx_list = &bctx->backend_srq->cqe_ctx_list;
                }

                rdma_protected_gslist_remove_int32(cqe_ctx_list, wc[i].wr_id);
                rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
                g_free(bctx);
            }
            total_ne += ne;
        } while (ne > 0);
        qatomic_sub(&rdma_dev_res->stats.missing_cqe, total_ne);
    }

    if (ne < 0) {
        rdma_error_report("ibv_poll_cq fail, rc=%d, errno=%d", ne, errno);
    }

    rdma_dev_res->stats.completions += total_ne;

    return total_ne;
}

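/*
 * Completion-event thread.  The completion channel FD is switched to
 * non-blocking mode and polled with a timeout so that comp_thread.run is
 * re-checked periodically; each CQ event is re-armed with
 * ibv_req_notify_cq(), drained through rdma_poll_cq() and then acknowledged.
 */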
static void *comp_handler_thread(void *arg)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)arg;
    int rc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    int flags;
    GPollFD pfds[1];

    /* Change to non-blocking mode */
    flags = fcntl(backend_dev->channel->fd, F_GETFL);
    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
    if (rc < 0) {
        rdma_error_report("Failed to change backend channel FD to non-blocking");
        return NULL;
    }

    pfds[0].fd = backend_dev->channel->fd;
    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;

    backend_dev->comp_thread.is_running = true;

    while (backend_dev->comp_thread.run) {
        do {
            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
            if (!rc) {
                backend_dev->rdma_dev_res->stats.poll_cq_ppoll_to++;
            }
        } while (!rc && backend_dev->comp_thread.run);

        if (backend_dev->comp_thread.run) {
            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
            if (unlikely(rc)) {
                rdma_error_report("ibv_get_cq_event fail, rc=%d, errno=%d", rc,
                                  errno);
                continue;
            }

            rc = ibv_req_notify_cq(ev_cq, 0);
            if (unlikely(rc)) {
                rdma_error_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc,
                                  errno);
            }

            backend_dev->rdma_dev_res->stats.poll_cq_from_bk++;
            rdma_poll_cq(backend_dev->rdma_dev_res, ev_cq);

            ibv_ack_cq_events(ev_cq, 1);
        }
    }

    backend_dev->comp_thread.is_running = false;

    qemu_thread_exit(0);

    return NULL;
}

static inline void disable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    qatomic_set(&backend_dev->rdmacm_mux.can_receive, 0);
}

static inline void enable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    qatomic_set(&backend_dev->rdmacm_mux.can_receive, sizeof(RdmaCmMuxMsg));
}

static inline int rdmacm_mux_can_process_async(RdmaBackendDev *backend_dev)
{
    return qatomic_read(&backend_dev->rdmacm_mux.can_receive);
}

static int rdmacm_mux_check_op_status(CharBackend *mad_chr_be)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    ret = qemu_chr_fe_read_all(mad_chr_be, (uint8_t *)&msg, sizeof(msg));
    if (ret != sizeof(msg)) {
        rdma_error_report("Got invalid message from mux: size %d, expecting %d",
                          ret, (int)sizeof(msg));
        return -EIO;
    }

    trace_rdmacm_mux_check_op_status(msg.hdr.msg_type, msg.hdr.op_code,
                                     msg.hdr.err_code);

    if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_RESP) {
        rdma_error_report("Got invalid message type %d", msg.hdr.msg_type);
        return -EIO;
    }

    if (msg.hdr.err_code != RDMACM_MUX_ERR_CODE_OK) {
        rdma_error_report("Operation failed in mux, error code %d",
                          msg.hdr.err_code);
        return -EIO;
    }

    return 0;
}

static int rdmacm_mux_send(RdmaBackendDev *backend_dev, RdmaCmMuxMsg *msg)
{
    int rc = 0;

    msg->hdr.msg_type = RDMACM_MUX_MSG_TYPE_REQ;
    trace_rdmacm_mux("send", msg->hdr.msg_type, msg->hdr.op_code);
    disable_rdmacm_mux_async(backend_dev);
    rc = qemu_chr_fe_write(backend_dev->rdmacm_mux.chr_be,
                           (const uint8_t *)msg, sizeof(*msg));
    if (rc != sizeof(*msg)) {
        enable_rdmacm_mux_async(backend_dev);
        rdma_error_report("Failed to send request to rdmacm_mux (rc=%d)", rc);
        return -EIO;
    }

    rc = rdmacm_mux_check_op_status(backend_dev->rdmacm_mux.chr_be);
    if (rc) {
        rdma_error_report("Failed to execute rdmacm_mux request %d (rc=%d)",
                          msg->hdr.op_code, rc);
    }

    enable_rdmacm_mux_async(backend_dev);

    return 0;
}

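/*
 * Thread control is cooperative: stop_backend_thread() clears 'run' and then
 * waits, sleeping between checks, until the thread reports that it is no
 * longer running.
 */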
static void stop_backend_thread(RdmaBackendThread *thread)
{
    thread->run = false;
    while (thread->is_running) {
        sleep(THR_POLL_TO / SCALE_US / 2);
    }
}

static void start_comp_thread(RdmaBackendDev *backend_dev)
{
    char thread_name[THR_NAME_LEN] = {};

    stop_backend_thread(&backend_dev->comp_thread);

    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
             ibv_get_device_name(backend_dev->ib_dev));
    backend_dev->comp_thread.run = true;
    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
}

void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
                                                        struct ibv_wc *wc))
{
    comp_handler = handler;
}

void rdma_backend_unregister_comp_handler(void)
{
    rdma_backend_register_comp_handler(dummy_comp_handler);
}

int rdma_backend_query_port(RdmaBackendDev *backend_dev,
                            struct ibv_port_attr *port_attr)
{
    int rc;

    rc = ibv_query_port(backend_dev->context, backend_dev->port_num, port_attr);
    if (rc) {
        rdma_error_report("ibv_query_port fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
{
    int polled;

    rdma_dev_res->stats.poll_cq_from_guest++;
    polled = rdma_poll_cq(rdma_dev_res, cq->ibcq);
    if (!polled) {
        rdma_dev_res->stats.poll_cq_from_guest_empty++;
    }
}

static GHashTable *ah_hash;

static struct ibv_ah *create_ah(RdmaBackendDev *backend_dev, struct ibv_pd *pd,
                                uint8_t sgid_idx, union ibv_gid *dgid)
{
    GBytes *ah_key = g_bytes_new(dgid, sizeof(*dgid));
    struct ibv_ah *ah = g_hash_table_lookup(ah_hash, ah_key);

    if (ah) {
        trace_rdma_create_ah_cache_hit(be64_to_cpu(dgid->global.subnet_prefix),
                                       be64_to_cpu(dgid->global.interface_id));
        g_bytes_unref(ah_key);
    } else {
        struct ibv_ah_attr ah_attr = {
            .is_global = 1,
            .port_num = backend_dev->port_num,
            .grh.hop_limit = 1,
        };

        ah_attr.grh.dgid = *dgid;
        ah_attr.grh.sgid_index = sgid_idx;

        ah = ibv_create_ah(pd, &ah_attr);
        if (ah) {
            g_hash_table_insert(ah_hash, ah_key, ah);
        } else {
            g_bytes_unref(ah_key);
            rdma_error_report("Failed to create AH for gid <0x%" PRIx64", 0x%"PRIx64">",
                              be64_to_cpu(dgid->global.subnet_prefix),
                              be64_to_cpu(dgid->global.interface_id));
        }

        trace_rdma_create_ah_cache_miss(be64_to_cpu(dgid->global.subnet_prefix),
                                        be64_to_cpu(dgid->global.interface_id));
    }

    return ah;
}

static void destroy_ah_hash_key(gpointer data)
{
    g_bytes_unref(data);
}

static void destroy_ah_hast_data(gpointer data)
{
    struct ibv_ah *ah = data;

    ibv_destroy_ah(ah);
}

static void ah_cache_init(void)
{
    ah_hash = g_hash_table_new_full(g_bytes_hash, g_bytes_equal,
                                    destroy_ah_hash_key, destroy_ah_hast_data);
}

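/*
 * Translate the guest SGEs for the host HCA.  With LEGACY_RDMA_REG_MR the
 * guest lkey is replaced by the host MR's lkey and the address is rebased
 * onto the host mapping; with ibv_reg_mr_iova() the guest keys and addresses
 * are usable as-is, so only the total length is accumulated for statistics.
 */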
#ifdef LEGACY_RDMA_REG_MR
static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
                                struct ibv_sge *sge, uint8_t num_sge,
                                uint64_t *total_length)
{
    RdmaRmMR *mr;
    int idx;

    for (idx = 0; idx < num_sge; idx++) {
        mr = rdma_rm_get_mr(rdma_dev_res, sge[idx].lkey);
        if (unlikely(!mr)) {
            rdma_error_report("Invalid lkey 0x%x", sge[idx].lkey);
            return VENDOR_ERR_INVLKEY | sge[idx].lkey;
        }

        sge[idx].addr = (uintptr_t)mr->virt + sge[idx].addr - mr->start;
        sge[idx].lkey = rdma_backend_mr_lkey(&mr->backend_mr);

        *total_length += sge[idx].length;
    }

    return 0;
}
#else
static inline int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
                                       struct ibv_sge *sge, uint8_t num_sge,
                                       uint64_t *total_length)
{
    int idx;

    for (idx = 0; idx < num_sge; idx++) {
        *total_length += sge[idx].length;
    }
    return 0;
}
#endif

static void trace_mad_message(const char *title, char *buf, int len)
{
    int i;
    char *b = g_malloc0(len * 3 + 1);
    char b1[4];

    for (i = 0; i < len; i++) {
        sprintf(b1, "%.2X ", buf[i] & 0x000000FF);
        strcat(b, b1);
    }

    trace_rdma_mad_message(title, len, b);

    g_free(b);
}

static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
                    union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
{
    RdmaCmMuxMsg msg = {};
    char *hdr, *data;
    int ret;

    if (num_sge != 2) {
        return -EINVAL;
    }

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_MAD;
    memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));

    msg.umad_len = sge[0].length + sge[1].length;

    if (msg.umad_len > sizeof(msg.umad.mad)) {
        return -ENOMEM;
    }

    msg.umad.hdr.addr.qpn = htobe32(1);
    msg.umad.hdr.addr.grh_present = 1;
    msg.umad.hdr.addr.gid_index = sgid_idx;
    memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
    msg.umad.hdr.addr.hop_limit = 0xFF;

    hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
    if (!hdr) {
        return -ENOMEM;
    }
    data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
    if (!data) {
        rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
        return -ENOMEM;
    }

    memcpy(&msg.umad.mad[0], hdr, sge[0].length);
    memcpy(&msg.umad.mad[sge[0].length], data, sge[1].length);

    rdma_pci_dma_unmap(backend_dev->dev, data, sge[1].length);
    rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);

    trace_mad_message("send", msg.umad.mad, msg.umad_len);

    ret = rdmacm_mux_send(backend_dev, &msg);
    if (ret) {
        rdma_error_report("Failed to send MAD to rdma_umadmux (%d)", ret);
        return -EIO;
    }

    return 0;
}

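/*
 * Post a send WR on the backend QP.  QP1 (GSI) traffic cannot go to the real
 * HCA and is redirected to the rdmacm-mux service instead; for regular QPs a
 * CQE context is allocated so the completion thread can route the completion
 * back to the guest.
 */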
void rdma_backend_post_send(RdmaBackendDev *backend_dev,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge,
                            uint8_t sgid_idx, union ibv_gid *sgid,
                            union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
                            void *ctx)
{
    BackendCtx *bctx;
    uint32_t bctx_id;
    int rc;
    struct ibv_send_wr wr = {}, *bad_wr;

    if (!qp->ibqp) { /* This field is not initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            rdma_error_report("Got QP0 request");
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        } else if (qp_type == IBV_QPT_GSI) {
            rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
            if (rc) {
                complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
                backend_dev->rdma_dev_res->stats.mad_tx_err++;
            } else {
                complete_work(IBV_WC_SUCCESS, 0, ctx);
                backend_dev->rdma_dev_res->stats.mad_tx++;
            }
        }
        return;
    }

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->backend_qp = qp;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto err_free_bctx;
    }

    rdma_protected_gslist_append_int32(&qp->cqe_ctx_list, bctx_id);

    rc = build_host_sge_array(backend_dev->rdma_dev_res, sge, num_sge,
                              &backend_dev->rdma_dev_res->stats.tx_len);
    if (rc) {
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto err_dealloc_cqe_ctx;
    }

    if (qp_type == IBV_QPT_UD) {
        wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
        if (!wr.wr.ud.ah) {
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
            goto err_dealloc_cqe_ctx;
        }
        wr.wr.ud.remote_qpn = dqpn;
        wr.wr.ud.remote_qkey = dqkey;
    }

    wr.num_sge = num_sge;
    wr.opcode = IBV_WR_SEND;
    wr.send_flags = IBV_SEND_SIGNALED;
    wr.sg_list = sge;
    wr.wr_id = bctx_id;

    rc = ibv_post_send(qp->ibqp, &wr, &bad_wr);
    if (rc) {
        rdma_error_report("ibv_post_send fail, qpn=0x%x, rc=%d, errno=%d",
                          qp->ibqp->qp_num, rc, errno);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto err_dealloc_cqe_ctx;
    }

    qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    backend_dev->rdma_dev_res->stats.tx++;

    return;

err_dealloc_cqe_ctx:
    backend_dev->rdma_dev_res->stats.tx_err++;
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

err_free_bctx:
    g_free(bctx);
}

static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
                                         struct ibv_sge *sge, uint32_t num_sge,
                                         void *ctx)
{
    BackendCtx *bctx;
    int rc;
    uint32_t bctx_id;

    if (num_sge != 1) {
        rdma_error_report("Invalid num_sge (%d), expecting 1", num_sge);
        return VENDOR_ERR_INV_NUM_SGE;
    }

    if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
        rdma_error_report("Too small buffer for MAD");
        return VENDOR_ERR_INV_MAD_BUFF;
    }

    bctx = g_malloc0(sizeof(*bctx));

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        g_free(bctx);
        return VENDOR_ERR_NOMEM;
    }

    bctx->up_ctx = ctx;
    bctx->sge = *sge;

    rdma_protected_gqueue_append_int64(&backend_dev->recv_mads_list, bctx_id);

    return 0;
}

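/*
 * Post a receive WR on the backend QP.  For QP1 (GSI) the buffer is only
 * recorded in recv_mads_list and consumed later when a MAD arrives from the
 * rdmacm-mux; for regular QPs the buffer is handed to the host HCA.
 */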
void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge, void *ctx)
{
    BackendCtx *bctx;
    uint32_t bctx_id;
    int rc;
    struct ibv_recv_wr wr = {}, *bad_wr;

    if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            rdma_error_report("Got QP0 request");
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        }
        if (qp_type == IBV_QPT_GSI) {
            rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
            if (rc) {
                complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
                backend_dev->rdma_dev_res->stats.mad_rx_bufs_err++;
            } else {
                backend_dev->rdma_dev_res->stats.mad_rx_bufs++;
            }
        }
        return;
    }

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->backend_qp = qp;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto err_free_bctx;
    }

    rdma_protected_gslist_append_int32(&qp->cqe_ctx_list, bctx_id);

    rc = build_host_sge_array(backend_dev->rdma_dev_res, sge, num_sge,
                              &backend_dev->rdma_dev_res->stats.rx_bufs_len);
    if (rc) {
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto err_dealloc_cqe_ctx;
    }

    wr.num_sge = num_sge;
    wr.sg_list = sge;
    wr.wr_id = bctx_id;
    rc = ibv_post_recv(qp->ibqp, &wr, &bad_wr);
    if (rc) {
        rdma_error_report("ibv_post_recv fail, qpn=0x%x, rc=%d, errno=%d",
                          qp->ibqp->qp_num, rc, errno);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto err_dealloc_cqe_ctx;
    }

    qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    backend_dev->rdma_dev_res->stats.rx_bufs++;

    return;

err_dealloc_cqe_ctx:
    backend_dev->rdma_dev_res->stats.rx_bufs_err++;
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

err_free_bctx:
    g_free(bctx);
}

void rdma_backend_post_srq_recv(RdmaBackendDev *backend_dev,
                                RdmaBackendSRQ *srq, struct ibv_sge *sge,
                                uint32_t num_sge, void *ctx)
{
    BackendCtx *bctx;
    uint32_t bctx_id;
    int rc;
    struct ibv_recv_wr wr = {}, *bad_wr;

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->backend_srq = srq;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto err_free_bctx;
    }

    rdma_protected_gslist_append_int32(&srq->cqe_ctx_list, bctx_id);

    rc = build_host_sge_array(backend_dev->rdma_dev_res, sge, num_sge,
                              &backend_dev->rdma_dev_res->stats.rx_bufs_len);
    if (rc) {
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto err_dealloc_cqe_ctx;
    }

    wr.num_sge = num_sge;
    wr.sg_list = sge;
    wr.wr_id = bctx_id;
    rc = ibv_post_srq_recv(srq->ibsrq, &wr, &bad_wr);
    if (rc) {
        rdma_error_report("ibv_post_srq_recv fail, srqn=0x%x, rc=%d, errno=%d",
                          srq->ibsrq->handle, rc, errno);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto err_dealloc_cqe_ctx;
    }

    qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    backend_dev->rdma_dev_res->stats.rx_bufs++;
    backend_dev->rdma_dev_res->stats.rx_srq++;

    return;

err_dealloc_cqe_ctx:
    backend_dev->rdma_dev_res->stats.rx_bufs_err++;
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

err_free_bctx:
    g_free(bctx);
}

int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd)
{
    pd->ibpd = ibv_alloc_pd(backend_dev->context);

    if (!pd->ibpd) {
        rdma_error_report("ibv_alloc_pd fail, errno=%d", errno);
        return -EIO;
    }

    return 0;
}

void rdma_backend_destroy_pd(RdmaBackendPD *pd)
{
    if (pd->ibpd) {
        ibv_dealloc_pd(pd->ibpd);
    }
}

int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
                           size_t length, uint64_t guest_start, int access)
{
#ifdef LEGACY_RDMA_REG_MR
    mr->ibmr = ibv_reg_mr(pd->ibpd, addr, length, access);
#else
    mr->ibmr = ibv_reg_mr_iova(pd->ibpd, addr, length, guest_start, access);
#endif
    if (!mr->ibmr) {
        rdma_error_report("ibv_reg_mr fail, errno=%d", errno);
        return -EIO;
    }

    mr->ibpd = pd->ibpd;

    return 0;
}

void rdma_backend_destroy_mr(RdmaBackendMR *mr)
{
    if (mr->ibmr) {
        ibv_dereg_mr(mr->ibmr);
    }
}

int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
                           int cqe)
{
    int rc;

    cq->ibcq = ibv_create_cq(backend_dev->context, cqe + 1, NULL,
                             backend_dev->channel, 0);
    if (!cq->ibcq) {
        rdma_error_report("ibv_create_cq fail, errno=%d", errno);
        return -EIO;
    }

    rc = ibv_req_notify_cq(cq->ibcq, 0);
    if (rc) {
        rdma_warn_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc, errno);
    }

    cq->backend_dev = backend_dev;

    return 0;
}

void rdma_backend_destroy_cq(RdmaBackendCQ *cq)
{
    if (cq->ibcq) {
        ibv_destroy_cq(cq->ibcq);
    }
}

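/*
 * Create the backend QP.  GSI QPs are not backed by a real ibv QP (qp->ibqp
 * stays unset and their traffic goes through the MAD path); only RC and UD
 * types are passed to ibv_create_qp().
 */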
int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
                           RdmaBackendPD *pd, RdmaBackendCQ *scq,
                           RdmaBackendCQ *rcq, RdmaBackendSRQ *srq,
                           uint32_t max_send_wr, uint32_t max_recv_wr,
                           uint32_t max_send_sge, uint32_t max_recv_sge)
{
    struct ibv_qp_init_attr attr = {};

    qp->ibqp = 0;

    switch (qp_type) {
    case IBV_QPT_GSI:
        return 0;

    case IBV_QPT_RC:
        /* fall through */
    case IBV_QPT_UD:
        /* do nothing */
        break;

    default:
        rdma_error_report("Unsupported QP type %d", qp_type);
        return -EIO;
    }

    attr.qp_type = qp_type;
    attr.send_cq = scq->ibcq;
    attr.recv_cq = rcq->ibcq;
    attr.cap.max_send_wr = max_send_wr;
    attr.cap.max_recv_wr = max_recv_wr;
    attr.cap.max_send_sge = max_send_sge;
    attr.cap.max_recv_sge = max_recv_sge;
    if (srq) {
        attr.srq = srq->ibsrq;
    }

    qp->ibqp = ibv_create_qp(pd->ibpd, &attr);
    if (!qp->ibqp) {
        rdma_error_report("ibv_create_qp fail, errno=%d", errno);
        return -EIO;
    }

    rdma_protected_gslist_init(&qp->cqe_ctx_list);

    qp->ibpd = pd->ibpd;

    /* TODO: Query QP to get max_inline_data and save it to be used in send */

    return 0;
}

int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                               uint8_t qp_type, uint32_t qkey)
{
    struct ibv_qp_attr attr = {};
    int rc, attr_mask;

    attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
    attr.qp_state = IBV_QPS_INIT;
    attr.pkey_index = 0;
    attr.port_num = backend_dev->port_num;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr_mask |= IBV_QP_ACCESS_FLAGS;
        trace_rdma_backend_rc_qp_state_init(qp->ibqp->qp_num);
        break;

    case IBV_QPT_UD:
        attr.qkey = qkey;
        attr_mask |= IBV_QP_QKEY;
        trace_rdma_backend_ud_qp_state_init(qp->ibqp->qp_num, qkey);
        break;

    default:
        rdma_error_report("Unsupported QP type %d", qp_type);
        return -EIO;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

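/*
 * Move the QP to RTR.  For RC QPs the path (GRH address vector), destination
 * QP number, expected RQ PSN and RNR/atomic limits are programmed; UD QPs
 * only need the qkey when one is supplied by the guest.
 */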
int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                              uint8_t qp_type, uint8_t sgid_idx,
                              union ibv_gid *dgid, uint32_t dqpn,
                              uint32_t rq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {};
    union ibv_gid ibv_gid = {
        .global.interface_id = dgid->global.interface_id,
        .global.subnet_prefix = dgid->global.subnet_prefix
    };
    int rc, attr_mask;

    attr.qp_state = IBV_QPS_RTR;
    attr_mask = IBV_QP_STATE;

    qp->sgid_idx = sgid_idx;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.path_mtu = IBV_MTU_1024;
        attr.dest_qp_num = dqpn;
        attr.max_dest_rd_atomic = 1;
        attr.min_rnr_timer = 12;
        attr.ah_attr.port_num = backend_dev->port_num;
        attr.ah_attr.is_global = 1;
        attr.ah_attr.grh.hop_limit = 1;
        attr.ah_attr.grh.dgid = ibv_gid;
        attr.ah_attr.grh.sgid_index = qp->sgid_idx;
        attr.rq_psn = rq_psn;

        attr_mask |= IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
                     IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC |
                     IBV_QP_MIN_RNR_TIMER;

        trace_rdma_backend_rc_qp_state_rtr(qp->ibqp->qp_num,
                                           be64_to_cpu(ibv_gid.global.
                                                       subnet_prefix),
                                           be64_to_cpu(ibv_gid.global.
                                                       interface_id),
                                           qp->sgid_idx, dqpn, rq_psn);
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        trace_rdma_backend_ud_qp_state_rtr(qp->ibqp->qp_num, use_qkey ? qkey :
                                           0);
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
                              uint32_t sq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {};
    int rc, attr_mask;

    attr.qp_state = IBV_QPS_RTS;
    attr.sq_psn = sq_psn;
    attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.timeout = 14;
        attr.retry_cnt = 7;
        attr.rnr_retry = 7;
        attr.max_rd_atomic = 1;

        attr_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
                     IBV_QP_MAX_QP_RD_ATOMIC;
        trace_rdma_backend_rc_qp_state_rts(qp->ibqp->qp_num, sq_psn);
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        trace_rdma_backend_ud_qp_state_rts(qp->ibqp->qp_num, sq_psn,
                                           use_qkey ? qkey : 0);
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
                          int attr_mask, struct ibv_qp_init_attr *init_attr)
{
    if (!qp->ibqp) {
        attr->qp_state = IBV_QPS_RTS;
        return 0;
    }

    return ibv_query_qp(qp->ibqp, attr, attr_mask, init_attr);
}

void rdma_backend_destroy_qp(RdmaBackendQP *qp, RdmaDeviceResources *dev_res)
{
    if (qp->ibqp) {
        ibv_destroy_qp(qp->ibqp);
    }
    g_slist_foreach(qp->cqe_ctx_list.list, free_cqe_ctx, dev_res);
    rdma_protected_gslist_destroy(&qp->cqe_ctx_list);
}

int rdma_backend_create_srq(RdmaBackendSRQ *srq, RdmaBackendPD *pd,
                            uint32_t max_wr, uint32_t max_sge,
                            uint32_t srq_limit)
{
    struct ibv_srq_init_attr srq_init_attr = {};

    srq_init_attr.attr.max_wr = max_wr;
    srq_init_attr.attr.max_sge = max_sge;
    srq_init_attr.attr.srq_limit = srq_limit;

    srq->ibsrq = ibv_create_srq(pd->ibpd, &srq_init_attr);
    if (!srq->ibsrq) {
        rdma_error_report("ibv_create_srq failed, errno=%d", errno);
        return -EIO;
    }

    rdma_protected_gslist_init(&srq->cqe_ctx_list);

    return 0;
}

int rdma_backend_query_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr)
{
    if (!srq->ibsrq) {
        return -EINVAL;
    }

    return ibv_query_srq(srq->ibsrq, srq_attr);
}

int rdma_backend_modify_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr,
                            int srq_attr_mask)
{
    if (!srq->ibsrq) {
        return -EINVAL;
    }

    return ibv_modify_srq(srq->ibsrq, srq_attr, srq_attr_mask);
}

void rdma_backend_destroy_srq(RdmaBackendSRQ *srq, RdmaDeviceResources *dev_res)
{
    if (srq->ibsrq) {
        ibv_destroy_srq(srq->ibsrq);
    }
    g_slist_foreach(srq->cqe_ctx_list.list, free_cqe_ctx, dev_res);
    rdma_protected_gslist_destroy(&srq->cqe_ctx_list);
}

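/*
 * Clamp each capability requested by the device model to the host HCA's
 * reported limit; CHK_ATTR() warns and lowers the value whenever a request
 * exceeds what the host device supports.
 */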
#define CHK_ATTR(req, dev, member, fmt) ({ \
    trace_rdma_check_dev_attr(#member, dev.member, req->member); \
    if (req->member > dev.member) { \
        rdma_warn_report("%s = "fmt" is higher than host device capability "fmt, \
                         #member, req->member, dev.member); \
        req->member = dev.member; \
    } \
})

static int init_device_caps(RdmaBackendDev *backend_dev,
                            struct ibv_device_attr *dev_attr)
{
    struct ibv_device_attr bk_dev_attr;
    int rc;

    rc = ibv_query_device(backend_dev->context, &bk_dev_attr);
    if (rc) {
        rdma_error_report("ibv_query_device fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    dev_attr->max_sge = MAX_SGE;
    dev_attr->max_srq_sge = MAX_SGE;

    CHK_ATTR(dev_attr, bk_dev_attr, max_mr_size, "%" PRId64);
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_sge, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_cq, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_mr, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_pd, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_rd_atom, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_init_rd_atom, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_ah, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_srq, "%d");

    return 0;
}

static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
                                 union ibv_gid *my_gid, int paylen)
{
    grh->paylen = htons(paylen);
    grh->sgid = *sgid;
    grh->dgid = *my_gid;
}

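/*
 * Deliver a MAD received from the rdmacm-mux into one of the receive buffers
 * previously queued on QP1: a GRH is synthesized in front of the payload and
 * a completion flagged with IBV_WC_GRH is reported to the guest.
 */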
static void process_incoming_mad_req(RdmaBackendDev *backend_dev,
                                     RdmaCmMuxMsg *msg)
{
    unsigned long cqe_ctx_id;
    BackendCtx *bctx;
    char *mad;

    trace_mad_message("recv", msg->umad.mad, msg->umad_len);

    cqe_ctx_id = rdma_protected_gqueue_pop_int64(&backend_dev->recv_mads_list);
    if (cqe_ctx_id == -ENOENT) {
        rdma_warn_report("No more free MADs buffers, waiting for a while");
        sleep(THR_POLL_TO);
        return;
    }

    bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
    if (unlikely(!bctx)) {
        rdma_error_report("No matching ctx for req %ld", cqe_ctx_id);
        backend_dev->rdma_dev_res->stats.mad_rx_err++;
        return;
    }

    mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
                           bctx->sge.length);
    if (!mad || bctx->sge.length < msg->umad_len + MAD_HDR_SIZE) {
        backend_dev->rdma_dev_res->stats.mad_rx_err++;
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
                      bctx->up_ctx);
    } else {
        struct ibv_wc wc = {};
        memset(mad, 0, bctx->sge.length);
        build_mad_hdr((struct ibv_grh *)mad,
                      (union ibv_gid *)&msg->umad.hdr.addr.gid, &msg->hdr.sgid,
                      msg->umad_len);
        memcpy(&mad[MAD_HDR_SIZE], msg->umad.mad, msg->umad_len);
        rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);

        wc.byte_len = msg->umad_len;
        wc.status = IBV_WC_SUCCESS;
        wc.wc_flags = IBV_WC_GRH;
        backend_dev->rdma_dev_res->stats.mad_rx++;
        comp_handler(bctx->up_ctx, &wc);
    }

    g_free(bctx);
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
}

static inline int rdmacm_mux_can_receive(void *opaque)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;

    return rdmacm_mux_can_process_async(backend_dev);
}

static void rdmacm_mux_read(void *opaque, const uint8_t *buf, int size)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
    RdmaCmMuxMsg *msg = (RdmaCmMuxMsg *)buf;

    trace_rdmacm_mux("read", msg->hdr.msg_type, msg->hdr.op_code);

    if (msg->hdr.msg_type != RDMACM_MUX_MSG_TYPE_REQ &&
        msg->hdr.op_code != RDMACM_MUX_OP_CODE_MAD) {
        rdma_error_report("Error: Not a MAD request, skipping");
        return;
    }
    process_incoming_mad_req(backend_dev, msg);
}

static int mad_init(RdmaBackendDev *backend_dev, CharBackend *mad_chr_be)
{
    int ret;

    backend_dev->rdmacm_mux.chr_be = mad_chr_be;

    ret = qemu_chr_fe_backend_connected(backend_dev->rdmacm_mux.chr_be);
    if (!ret) {
        rdma_error_report("Missing chardev for MAD multiplexer");
        return -EIO;
    }

    rdma_protected_gqueue_init(&backend_dev->recv_mads_list);

    enable_rdmacm_mux_async(backend_dev);

    qemu_chr_fe_set_handlers(backend_dev->rdmacm_mux.chr_be,
                             rdmacm_mux_can_receive, rdmacm_mux_read, NULL,
                             NULL, backend_dev, NULL, true);

    return 0;
}

static void mad_stop(RdmaBackendDev *backend_dev)
{
    clean_recv_mads(backend_dev);
}

static void mad_fini(RdmaBackendDev *backend_dev)
{
    disable_rdmacm_mux_async(backend_dev);
    qemu_chr_fe_disconnect(backend_dev->rdmacm_mux.chr_be);
    rdma_protected_gqueue_destroy(&backend_dev->recv_mads_list);
}

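/*
 * Resolve the GID table index of a given GID by scanning the port's GID
 * table with ibv_query_gid() until a matching entry is found.
 */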
int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
                               union ibv_gid *gid)
{
    union ibv_gid sgid;
    int ret;
    int i = 0;

    do {
        ret = ibv_query_gid(backend_dev->context, backend_dev->port_num, i,
                            &sgid);
        i++;
    } while (!ret && (memcmp(&sgid, gid, sizeof(*gid))));

    trace_rdma_backend_get_gid_index(be64_to_cpu(gid->global.subnet_prefix),
                                     be64_to_cpu(gid->global.interface_id),
                                     i - 1);

    return ret ? ret : i - 1;
}

int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    trace_rdma_backend_gid_change("add", be64_to_cpu(gid->global.subnet_prefix),
                                  be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_REG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    ret = rdmacm_mux_send(backend_dev, &msg);
    if (ret) {
        rdma_error_report("Failed to register GID to rdma_umadmux (%d)", ret);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, true,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return ret;
}

int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    trace_rdma_backend_gid_change("del", be64_to_cpu(gid->global.subnet_prefix),
                                  be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_UNREG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    ret = rdmacm_mux_send(backend_dev, &msg);
    if (ret) {
        rdma_error_report("Failed to unregister GID from rdma_umadmux (%d)",
                          ret);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, false,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return 0;
}

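/*
 * Bring up the backend: pick an ibverbs device (by name, or the first one
 * found), open it, create the completion channel, clamp the reported
 * capabilities and hook up the MAD multiplexer chardev.  A rough sketch of
 * the expected lifecycle, as suggested by the API in this file (the actual
 * call sites live in the PVRDMA device code):
 *
 *     rdma_backend_init(...);
 *     rdma_backend_register_comp_handler(handler);
 *     rdma_backend_start(...);
 *     ...
 *     rdma_backend_stop(...);
 *     rdma_backend_fini(...);
 */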
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      struct ibv_device_attr *dev_attr, CharBackend *mad_chr_be)
{
    int i;
    int ret = 0;
    int num_ibv_devices;
    struct ibv_device **dev_list;

    memset(backend_dev, 0, sizeof(*backend_dev));

    backend_dev->dev = pdev;
    backend_dev->port_num = port_num;
    backend_dev->rdma_dev_res = rdma_dev_res;

    rdma_backend_register_comp_handler(dummy_comp_handler);

    dev_list = ibv_get_device_list(&num_ibv_devices);
    if (!dev_list) {
        rdma_error_report("Failed to get IB devices list");
        return -EIO;
    }

    if (num_ibv_devices == 0) {
        rdma_error_report("No IB devices were found");
        ret = -ENXIO;
        goto out_free_dev_list;
    }

    if (backend_device_name) {
        for (i = 0; dev_list[i]; ++i) {
            if (!strcmp(ibv_get_device_name(dev_list[i]),
                        backend_device_name)) {
                break;
            }
        }

        backend_dev->ib_dev = dev_list[i];
        if (!backend_dev->ib_dev) {
            rdma_error_report("Failed to find IB device %s",
                              backend_device_name);
            ret = -EIO;
            goto out_free_dev_list;
        }
    } else {
        backend_dev->ib_dev = *dev_list;
    }

    rdma_info_report("uverb device %s", backend_dev->ib_dev->dev_name);

    backend_dev->context = ibv_open_device(backend_dev->ib_dev);
    if (!backend_dev->context) {
        rdma_error_report("Failed to open IB device %s",
                          ibv_get_device_name(backend_dev->ib_dev));
        ret = -EIO;
        goto out;
    }

    backend_dev->channel = ibv_create_comp_channel(backend_dev->context);
    if (!backend_dev->channel) {
        rdma_error_report("Failed to create IB communication channel");
        ret = -EIO;
        goto out_close_device;
    }

    ret = init_device_caps(backend_dev, dev_attr);
    if (ret) {
        rdma_error_report("Failed to initialize device capabilities");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }


    ret = mad_init(backend_dev, mad_chr_be);
    if (ret) {
        rdma_error_report("Failed to initialize mad");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    backend_dev->comp_thread.run = false;
    backend_dev->comp_thread.is_running = false;

    ah_cache_init();

    goto out_free_dev_list;

out_destroy_comm_channel:
    ibv_destroy_comp_channel(backend_dev->channel);

out_close_device:
    ibv_close_device(backend_dev->context);

out_free_dev_list:
    ibv_free_device_list(dev_list);

out:
    return ret;
}


void rdma_backend_start(RdmaBackendDev *backend_dev)
{
    start_comp_thread(backend_dev);
}

void rdma_backend_stop(RdmaBackendDev *backend_dev)
{
    mad_stop(backend_dev);
    stop_backend_thread(&backend_dev->comp_thread);
}

void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
    mad_fini(backend_dev);
    g_hash_table_destroy(ah_hash);
    ibv_destroy_comp_channel(backend_dev->channel);
    ibv_close_device(backend_dev->context);
}