cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

rdma_backend.c (41105B)


      1/*
      2 * QEMU paravirtual RDMA - Generic RDMA backend
      3 *
      4 * Copyright (C) 2018 Oracle
      5 * Copyright (C) 2018 Red Hat Inc
      6 *
      7 * Authors:
      8 *     Yuval Shaia <yuval.shaia@oracle.com>
      9 *     Marcel Apfelbaum <marcel@redhat.com>
     10 *
     11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
     12 * See the COPYING file in the top-level directory.
     13 *
     14 */
     15
     16#include "qemu/osdep.h"
     17#include "qapi/qapi-events-rdma.h"
     18
     19#include <infiniband/verbs.h>
     20
     21#include "contrib/rdmacm-mux/rdmacm-mux.h"
     22#include "trace.h"
     23#include "rdma_utils.h"
     24#include "rdma_rm.h"
     25#include "rdma_backend.h"
     26
/* Size of the buffer that holds the completion thread's name */
#define THR_NAME_LEN 16
/*
 * Timeout of a single poll on the completion channel; it is multiplied by
 * SCALE_MS where it is passed to qemu_poll_ns().
 */
#define THR_POLL_TO  5000

/* Size of struct ibv_grh (presumably the header preceding a MAD - confirm) */
#define MAD_HDR_SIZE sizeof(struct ibv_grh)
     31
/*
 * Backend bookkeeping for one posted work request.  An instance is
 * registered with rdma_rm_alloc_cqe_ctx() and the returned id is used as
 * the work request's wr_id, so that the completion can be matched back.
 */
typedef struct BackendCtx {
    void *up_ctx; /* Caller's context, handed back to comp_handler */
    struct ibv_sge sge; /* Used to save MAD recv buffer */
    RdmaBackendQP *backend_qp; /* To maintain recv buffers */
    RdmaBackendSRQ *backend_srq; /* Set instead of backend_qp for SRQ recvs */
} BackendCtx;
     38
/* A user MAD header followed by up to RDMA_MAX_PRIVATE_DATA payload bytes */
struct backend_umad {
    struct ib_user_mad hdr;
    char mad[RDMA_MAX_PRIVATE_DATA];
};
     43
/*
 * Completion callback invoked for every work completion; installed through
 * rdma_backend_register_comp_handler().
 */
static void (*comp_handler)(void *ctx, struct ibv_wc *wc);
     45
/*
 * Placeholder completion handler installed by
 * rdma_backend_unregister_comp_handler(); it only reports an error.
 */
static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
{
    rdma_error_report("No completion handler is registered");
}
     50
     51static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err,
     52                                 void *ctx)
     53{
     54    struct ibv_wc wc = {};
     55
     56    wc.status = status;
     57    wc.vendor_err = vendor_err;
     58
     59    comp_handler(ctx, &wc);
     60}
     61
     62static void free_cqe_ctx(gpointer data, gpointer user_data)
     63{
     64    BackendCtx *bctx;
     65    RdmaDeviceResources *rdma_dev_res = user_data;
     66    unsigned long cqe_ctx_id = GPOINTER_TO_INT(data);
     67
     68    bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, cqe_ctx_id);
     69    if (bctx) {
     70        rdma_rm_dealloc_cqe_ctx(rdma_dev_res, cqe_ctx_id);
     71        qatomic_dec(&rdma_dev_res->stats.missing_cqe);
     72    }
     73    g_free(bctx);
     74}
     75
     76static void clean_recv_mads(RdmaBackendDev *backend_dev)
     77{
     78    unsigned long cqe_ctx_id;
     79
     80    do {
     81        cqe_ctx_id = rdma_protected_gqueue_pop_int64(&backend_dev->
     82                                                    recv_mads_list);
     83        if (cqe_ctx_id != -ENOENT) {
     84            qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
     85            free_cqe_ctx(GINT_TO_POINTER(cqe_ctx_id),
     86                         backend_dev->rdma_dev_res);
     87        }
     88    } while (cqe_ctx_id != -ENOENT);
     89}
     90
     91static int rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
     92{
     93    int i, ne, total_ne = 0;
     94    BackendCtx *bctx;
     95    struct ibv_wc wc[2];
     96    RdmaProtectedGSList *cqe_ctx_list;
     97
     98    WITH_QEMU_LOCK_GUARD(&rdma_dev_res->lock) {
     99        do {
    100            ne = ibv_poll_cq(ibcq, ARRAY_SIZE(wc), wc);
    101
    102            trace_rdma_poll_cq(ne, ibcq);
    103
    104            for (i = 0; i < ne; i++) {
    105                bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
    106                if (unlikely(!bctx)) {
    107                    rdma_error_report("No matching ctx for req %"PRId64,
    108                                      wc[i].wr_id);
    109                    continue;
    110                }
    111
    112                comp_handler(bctx->up_ctx, &wc[i]);
    113
    114                if (bctx->backend_qp) {
    115                    cqe_ctx_list = &bctx->backend_qp->cqe_ctx_list;
    116                } else {
    117                    cqe_ctx_list = &bctx->backend_srq->cqe_ctx_list;
    118                }
    119
    120                rdma_protected_gslist_remove_int32(cqe_ctx_list, wc[i].wr_id);
    121                rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
    122                g_free(bctx);
    123            }
    124            total_ne += ne;
    125        } while (ne > 0);
    126        qatomic_sub(&rdma_dev_res->stats.missing_cqe, total_ne);
    127    }
    128
    129    if (ne < 0) {
    130        rdma_error_report("ibv_poll_cq fail, rc=%d, errno=%d", ne, errno);
    131    }
    132
    133    rdma_dev_res->stats.completions += total_ne;
    134
    135    return total_ne;
    136}
    137
    138static void *comp_handler_thread(void *arg)
    139{
    140    RdmaBackendDev *backend_dev = (RdmaBackendDev *)arg;
    141    int rc;
    142    struct ibv_cq *ev_cq;
    143    void *ev_ctx;
    144    int flags;
    145    GPollFD pfds[1];
    146
    147    /* Change to non-blocking mode */
    148    flags = fcntl(backend_dev->channel->fd, F_GETFL);
    149    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
    150    if (rc < 0) {
    151        rdma_error_report("Failed to change backend channel FD to non-blocking");
    152        return NULL;
    153    }
    154
    155    pfds[0].fd = backend_dev->channel->fd;
    156    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
    157
    158    backend_dev->comp_thread.is_running = true;
    159
    160    while (backend_dev->comp_thread.run) {
    161        do {
    162            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
    163            if (!rc) {
    164                backend_dev->rdma_dev_res->stats.poll_cq_ppoll_to++;
    165            }
    166        } while (!rc && backend_dev->comp_thread.run);
    167
    168        if (backend_dev->comp_thread.run) {
    169            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
    170            if (unlikely(rc)) {
    171                rdma_error_report("ibv_get_cq_event fail, rc=%d, errno=%d", rc,
    172                                  errno);
    173                continue;
    174            }
    175
    176            rc = ibv_req_notify_cq(ev_cq, 0);
    177            if (unlikely(rc)) {
    178                rdma_error_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc,
    179                                  errno);
    180            }
    181
    182            backend_dev->rdma_dev_res->stats.poll_cq_from_bk++;
    183            rdma_poll_cq(backend_dev->rdma_dev_res, ev_cq);
    184
    185            ibv_ack_cq_events(ev_cq, 1);
    186        }
    187    }
    188
    189    backend_dev->comp_thread.is_running = false;
    190
    191    qemu_thread_exit(0);
    192
    193    return NULL;
    194}
    195
/* Atomically set the rdmacm-mux can_receive value to 0 */
static inline void disable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    qatomic_set(&backend_dev->rdmacm_mux.can_receive, 0);
}
    200
/* Atomically set the rdmacm-mux can_receive value to one full RdmaCmMuxMsg */
static inline void enable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    qatomic_set(&backend_dev->rdmacm_mux.can_receive, sizeof(RdmaCmMuxMsg));
}
    205
/*
 * Atomically read the rdmacm-mux can_receive value: 0 after
 * disable_rdmacm_mux_async(), sizeof(RdmaCmMuxMsg) after
 * enable_rdmacm_mux_async().
 */
static inline int rdmacm_mux_can_process_async(RdmaBackendDev *backend_dev)
{
    return qatomic_read(&backend_dev->rdmacm_mux.can_receive);
}
    210
    211static int rdmacm_mux_check_op_status(CharBackend *mad_chr_be)
    212{
    213    RdmaCmMuxMsg msg = {};
    214    int ret;
    215
    216    ret = qemu_chr_fe_read_all(mad_chr_be, (uint8_t *)&msg, sizeof(msg));
    217    if (ret != sizeof(msg)) {
    218        rdma_error_report("Got invalid message from mux: size %d, expecting %d",
    219                          ret, (int)sizeof(msg));
    220        return -EIO;
    221    }
    222
    223    trace_rdmacm_mux_check_op_status(msg.hdr.msg_type, msg.hdr.op_code,
    224                                     msg.hdr.err_code);
    225
    226    if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_RESP) {
    227        rdma_error_report("Got invalid message type %d", msg.hdr.msg_type);
    228        return -EIO;
    229    }
    230
    231    if (msg.hdr.err_code != RDMACM_MUX_ERR_CODE_OK) {
    232        rdma_error_report("Operation failed in mux, error code %d",
    233                          msg.hdr.err_code);
    234        return -EIO;
    235    }
    236
    237    return 0;
    238}
    239
    240static int rdmacm_mux_send(RdmaBackendDev *backend_dev, RdmaCmMuxMsg *msg)
    241{
    242    int rc = 0;
    243
    244    msg->hdr.msg_type = RDMACM_MUX_MSG_TYPE_REQ;
    245    trace_rdmacm_mux("send", msg->hdr.msg_type, msg->hdr.op_code);
    246    disable_rdmacm_mux_async(backend_dev);
    247    rc = qemu_chr_fe_write(backend_dev->rdmacm_mux.chr_be,
    248                           (const uint8_t *)msg, sizeof(*msg));
    249    if (rc != sizeof(*msg)) {
    250        enable_rdmacm_mux_async(backend_dev);
    251        rdma_error_report("Failed to send request to rdmacm_mux (rc=%d)", rc);
    252        return -EIO;
    253    }
    254
    255    rc = rdmacm_mux_check_op_status(backend_dev->rdmacm_mux.chr_be);
    256    if (rc) {
    257        rdma_error_report("Failed to execute rdmacm_mux request %d (rc=%d)",
    258                          msg->hdr.op_code, rc);
    259    }
    260
    261    enable_rdmacm_mux_async(backend_dev);
    262
    263    return 0;
    264}
    265
    266static void stop_backend_thread(RdmaBackendThread *thread)
    267{
    268    thread->run = false;
    269    while (thread->is_running) {
    270        sleep(THR_POLL_TO / SCALE_US / 2);
    271    }
    272}
    273
    274static void start_comp_thread(RdmaBackendDev *backend_dev)
    275{
    276    char thread_name[THR_NAME_LEN] = {};
    277
    278    stop_backend_thread(&backend_dev->comp_thread);
    279
    280    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
    281             ibv_get_device_name(backend_dev->ib_dev));
    282    backend_dev->comp_thread.run = true;
    283    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
    284                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
    285}
    286
/*
 * Install @handler as the file-wide completion callback.  It is called
 * with the caller's context and the work completion for every completion
 * the backend reports.
 */
void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
                                                         struct ibv_wc *wc))
{
    comp_handler = handler;
}
    292
/*
 * Replace the completion callback with dummy_comp_handler, which only
 * reports an error when invoked.
 */
void rdma_backend_unregister_comp_handler(void)
{
    rdma_backend_register_comp_handler(dummy_comp_handler);
}
    297
    298int rdma_backend_query_port(RdmaBackendDev *backend_dev,
    299                            struct ibv_port_attr *port_attr)
    300{
    301    int rc;
    302
    303    rc = ibv_query_port(backend_dev->context, backend_dev->port_num, port_attr);
    304    if (rc) {
    305        rdma_error_report("ibv_query_port fail, rc=%d, errno=%d", rc, errno);
    306        return -EIO;
    307    }
    308
    309    return 0;
    310}
    311
    312void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
    313{
    314    int polled;
    315
    316    rdma_dev_res->stats.poll_cq_from_guest++;
    317    polled = rdma_poll_cq(rdma_dev_res, cq->ibcq);
    318    if (!polled) {
    319        rdma_dev_res->stats.poll_cq_from_guest_empty++;
    320    }
    321}
    322
/*
 * Cache of address handles: maps a GBytes copy of the destination GID to
 * the struct ibv_ah created for it.  Created by ah_cache_init().
 */
static GHashTable *ah_hash;
    324
    325static struct ibv_ah *create_ah(RdmaBackendDev *backend_dev, struct ibv_pd *pd,
    326                                uint8_t sgid_idx, union ibv_gid *dgid)
    327{
    328    GBytes *ah_key = g_bytes_new(dgid, sizeof(*dgid));
    329    struct ibv_ah *ah = g_hash_table_lookup(ah_hash, ah_key);
    330
    331    if (ah) {
    332        trace_rdma_create_ah_cache_hit(be64_to_cpu(dgid->global.subnet_prefix),
    333                                       be64_to_cpu(dgid->global.interface_id));
    334        g_bytes_unref(ah_key);
    335    } else {
    336        struct ibv_ah_attr ah_attr = {
    337            .is_global     = 1,
    338            .port_num      = backend_dev->port_num,
    339            .grh.hop_limit = 1,
    340        };
    341
    342        ah_attr.grh.dgid = *dgid;
    343        ah_attr.grh.sgid_index = sgid_idx;
    344
    345        ah = ibv_create_ah(pd, &ah_attr);
    346        if (ah) {
    347            g_hash_table_insert(ah_hash, ah_key, ah);
    348        } else {
    349            g_bytes_unref(ah_key);
    350            rdma_error_report("Failed to create AH for gid <0x%" PRIx64", 0x%"PRIx64">",
    351                              be64_to_cpu(dgid->global.subnet_prefix),
    352                              be64_to_cpu(dgid->global.interface_id));
    353        }
    354
    355        trace_rdma_create_ah_cache_miss(be64_to_cpu(dgid->global.subnet_prefix),
    356                                        be64_to_cpu(dgid->global.interface_id));
    357    }
    358
    359    return ah;
    360}
    361
/* Key destructor of the AH cache: drops the reference on the GBytes key */
static void destroy_ah_hash_key(gpointer data)
{
    g_bytes_unref(data);
}
    366
    367static void destroy_ah_hast_data(gpointer data)
    368{
    369    struct ibv_ah *ah = data;
    370
    371    ibv_destroy_ah(ah);
    372}
    373
/*
 * Create the AH cache.  Keys are hashed and compared as GBytes; keys and
 * values are released by destroy_ah_hash_key() and destroy_ah_hast_data().
 */
static void ah_cache_init(void)
{
    ah_hash = g_hash_table_new_full(g_bytes_hash, g_bytes_equal,
                                    destroy_ah_hash_key, destroy_ah_hast_data);
}
    379
    380#ifdef LEGACY_RDMA_REG_MR
    381static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
    382                                struct ibv_sge *sge, uint8_t num_sge,
    383                                uint64_t *total_length)
    384{
    385    RdmaRmMR *mr;
    386    int idx;
    387
    388    for (idx = 0; idx < num_sge; idx++) {
    389        mr = rdma_rm_get_mr(rdma_dev_res, sge[idx].lkey);
    390        if (unlikely(!mr)) {
    391            rdma_error_report("Invalid lkey 0x%x", sge[idx].lkey);
    392            return VENDOR_ERR_INVLKEY | sge[idx].lkey;
    393        }
    394
    395        sge[idx].addr = (uintptr_t)mr->virt + sge[idx].addr - mr->start;
    396        sge[idx].lkey = rdma_backend_mr_lkey(&mr->backend_mr);
    397
    398        *total_length += sge[idx].length;
    399    }
    400
    401    return 0;
    402}
    403#else
    404static inline int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
    405                                       struct ibv_sge *sge, uint8_t num_sge,
    406                                       uint64_t *total_length)
    407{
    408    int idx;
    409
    410    for (idx = 0; idx < num_sge; idx++) {
    411        *total_length += sge[idx].length;
    412    }
    413    return 0;
    414}
    415#endif
    416
    417static void trace_mad_message(const char *title, char *buf, int len)
    418{
    419    int i;
    420    char *b = g_malloc0(len * 3 + 1);
    421    char b1[4];
    422
    423    for (i = 0; i < len; i++) {
    424        sprintf(b1, "%.2X ", buf[i] & 0x000000FF);
    425        strcat(b, b1);
    426    }
    427
    428    trace_rdma_mad_message(title, len, b);
    429
    430    g_free(b);
    431}
    432
    433static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
    434                    union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
    435{
    436    RdmaCmMuxMsg msg = {};
    437    char *hdr, *data;
    438    int ret;
    439
    440    if (num_sge != 2) {
    441        return -EINVAL;
    442    }
    443
    444    msg.hdr.op_code = RDMACM_MUX_OP_CODE_MAD;
    445    memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));
    446
    447    msg.umad_len = sge[0].length + sge[1].length;
    448
    449    if (msg.umad_len > sizeof(msg.umad.mad)) {
    450        return -ENOMEM;
    451    }
    452
    453    msg.umad.hdr.addr.qpn = htobe32(1);
    454    msg.umad.hdr.addr.grh_present = 1;
    455    msg.umad.hdr.addr.gid_index = sgid_idx;
    456    memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
    457    msg.umad.hdr.addr.hop_limit = 0xFF;
    458
    459    hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
    460    if (!hdr) {
    461        return -ENOMEM;
    462    }
    463    data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
    464    if (!data) {
    465        rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
    466        return -ENOMEM;
    467    }
    468
    469    memcpy(&msg.umad.mad[0], hdr, sge[0].length);
    470    memcpy(&msg.umad.mad[sge[0].length], data, sge[1].length);
    471
    472    rdma_pci_dma_unmap(backend_dev->dev, data, sge[1].length);
    473    rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
    474
    475    trace_mad_message("send", msg.umad.mad, msg.umad_len);
    476
    477    ret = rdmacm_mux_send(backend_dev, &msg);
    478    if (ret) {
    479        rdma_error_report("Failed to send MAD to rdma_umadmux (%d)", ret);
    480        return -EIO;
    481    }
    482
    483    return 0;
    484}
    485
    486void rdma_backend_post_send(RdmaBackendDev *backend_dev,
    487                            RdmaBackendQP *qp, uint8_t qp_type,
    488                            struct ibv_sge *sge, uint32_t num_sge,
    489                            uint8_t sgid_idx, union ibv_gid *sgid,
    490                            union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
    491                            void *ctx)
    492{
    493    BackendCtx *bctx;
    494    uint32_t bctx_id;
    495    int rc;
    496    struct ibv_send_wr wr = {}, *bad_wr;
    497
    498    if (!qp->ibqp) { /* This field is not initialized for QP0 and QP1 */
    499        if (qp_type == IBV_QPT_SMI) {
    500            rdma_error_report("Got QP0 request");
    501            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
    502        } else if (qp_type == IBV_QPT_GSI) {
    503            rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
    504            if (rc) {
    505                complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
    506                backend_dev->rdma_dev_res->stats.mad_tx_err++;
    507            } else {
    508                complete_work(IBV_WC_SUCCESS, 0, ctx);
    509                backend_dev->rdma_dev_res->stats.mad_tx++;
    510            }
    511        }
    512        return;
    513    }
    514
    515    bctx = g_malloc0(sizeof(*bctx));
    516    bctx->up_ctx = ctx;
    517    bctx->backend_qp = qp;
    518
    519    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    520    if (unlikely(rc)) {
    521        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
    522        goto err_free_bctx;
    523    }
    524
    525    rdma_protected_gslist_append_int32(&qp->cqe_ctx_list, bctx_id);
    526
    527    rc = build_host_sge_array(backend_dev->rdma_dev_res, sge, num_sge,
    528                              &backend_dev->rdma_dev_res->stats.tx_len);
    529    if (rc) {
    530        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
    531        goto err_dealloc_cqe_ctx;
    532    }
    533
    534    if (qp_type == IBV_QPT_UD) {
    535        wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
    536        if (!wr.wr.ud.ah) {
    537            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
    538            goto err_dealloc_cqe_ctx;
    539        }
    540        wr.wr.ud.remote_qpn = dqpn;
    541        wr.wr.ud.remote_qkey = dqkey;
    542    }
    543
    544    wr.num_sge = num_sge;
    545    wr.opcode = IBV_WR_SEND;
    546    wr.send_flags = IBV_SEND_SIGNALED;
    547    wr.sg_list = sge;
    548    wr.wr_id = bctx_id;
    549
    550    rc = ibv_post_send(qp->ibqp, &wr, &bad_wr);
    551    if (rc) {
    552        rdma_error_report("ibv_post_send fail, qpn=0x%x, rc=%d, errno=%d",
    553                          qp->ibqp->qp_num, rc, errno);
    554        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
    555        goto err_dealloc_cqe_ctx;
    556    }
    557
    558    qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    559    backend_dev->rdma_dev_res->stats.tx++;
    560
    561    return;
    562
    563err_dealloc_cqe_ctx:
    564    backend_dev->rdma_dev_res->stats.tx_err++;
    565    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);
    566
    567err_free_bctx:
    568    g_free(bctx);
    569}
    570
    571static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
    572                                         struct ibv_sge *sge, uint32_t num_sge,
    573                                         void *ctx)
    574{
    575    BackendCtx *bctx;
    576    int rc;
    577    uint32_t bctx_id;
    578
    579    if (num_sge != 1) {
    580        rdma_error_report("Invalid num_sge (%d), expecting 1", num_sge);
    581        return VENDOR_ERR_INV_NUM_SGE;
    582    }
    583
    584    if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
    585        rdma_error_report("Too small buffer for MAD");
    586        return VENDOR_ERR_INV_MAD_BUFF;
    587    }
    588
    589    bctx = g_malloc0(sizeof(*bctx));
    590
    591    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    592    if (unlikely(rc)) {
    593        g_free(bctx);
    594        return VENDOR_ERR_NOMEM;
    595    }
    596
    597    bctx->up_ctx = ctx;
    598    bctx->sge = *sge;
    599
    600    rdma_protected_gqueue_append_int64(&backend_dev->recv_mads_list, bctx_id);
    601
    602    return 0;
    603}
    604
    605void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
    606                            RdmaBackendQP *qp, uint8_t qp_type,
    607                            struct ibv_sge *sge, uint32_t num_sge, void *ctx)
    608{
    609    BackendCtx *bctx;
    610    uint32_t bctx_id;
    611    int rc;
    612    struct ibv_recv_wr wr = {}, *bad_wr;
    613
    614    if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
    615        if (qp_type == IBV_QPT_SMI) {
    616            rdma_error_report("Got QP0 request");
    617            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
    618        }
    619        if (qp_type == IBV_QPT_GSI) {
    620            rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
    621            if (rc) {
    622                complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
    623                backend_dev->rdma_dev_res->stats.mad_rx_bufs_err++;
    624            } else {
    625                backend_dev->rdma_dev_res->stats.mad_rx_bufs++;
    626            }
    627        }
    628        return;
    629    }
    630
    631    bctx = g_malloc0(sizeof(*bctx));
    632    bctx->up_ctx = ctx;
    633    bctx->backend_qp = qp;
    634
    635    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    636    if (unlikely(rc)) {
    637        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
    638        goto err_free_bctx;
    639    }
    640
    641    rdma_protected_gslist_append_int32(&qp->cqe_ctx_list, bctx_id);
    642
    643    rc = build_host_sge_array(backend_dev->rdma_dev_res, sge, num_sge,
    644                              &backend_dev->rdma_dev_res->stats.rx_bufs_len);
    645    if (rc) {
    646        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
    647        goto err_dealloc_cqe_ctx;
    648    }
    649
    650    wr.num_sge = num_sge;
    651    wr.sg_list = sge;
    652    wr.wr_id = bctx_id;
    653    rc = ibv_post_recv(qp->ibqp, &wr, &bad_wr);
    654    if (rc) {
    655        rdma_error_report("ibv_post_recv fail, qpn=0x%x, rc=%d, errno=%d",
    656                          qp->ibqp->qp_num, rc, errno);
    657        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
    658        goto err_dealloc_cqe_ctx;
    659    }
    660
    661    qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    662    backend_dev->rdma_dev_res->stats.rx_bufs++;
    663
    664    return;
    665
    666err_dealloc_cqe_ctx:
    667    backend_dev->rdma_dev_res->stats.rx_bufs_err++;
    668    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);
    669
    670err_free_bctx:
    671    g_free(bctx);
    672}
    673
    674void rdma_backend_post_srq_recv(RdmaBackendDev *backend_dev,
    675                                RdmaBackendSRQ *srq, struct ibv_sge *sge,
    676                                uint32_t num_sge, void *ctx)
    677{
    678    BackendCtx *bctx;
    679    uint32_t bctx_id;
    680    int rc;
    681    struct ibv_recv_wr wr = {}, *bad_wr;
    682
    683    bctx = g_malloc0(sizeof(*bctx));
    684    bctx->up_ctx = ctx;
    685    bctx->backend_srq = srq;
    686
    687    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    688    if (unlikely(rc)) {
    689        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
    690        goto err_free_bctx;
    691    }
    692
    693    rdma_protected_gslist_append_int32(&srq->cqe_ctx_list, bctx_id);
    694
    695    rc = build_host_sge_array(backend_dev->rdma_dev_res, sge, num_sge,
    696                              &backend_dev->rdma_dev_res->stats.rx_bufs_len);
    697    if (rc) {
    698        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
    699        goto err_dealloc_cqe_ctx;
    700    }
    701
    702    wr.num_sge = num_sge;
    703    wr.sg_list = sge;
    704    wr.wr_id = bctx_id;
    705    rc = ibv_post_srq_recv(srq->ibsrq, &wr, &bad_wr);
    706    if (rc) {
    707        rdma_error_report("ibv_post_srq_recv fail, srqn=0x%x, rc=%d, errno=%d",
    708                          srq->ibsrq->handle, rc, errno);
    709        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
    710        goto err_dealloc_cqe_ctx;
    711    }
    712
    713    qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    714    backend_dev->rdma_dev_res->stats.rx_bufs++;
    715    backend_dev->rdma_dev_res->stats.rx_srq++;
    716
    717    return;
    718
    719err_dealloc_cqe_ctx:
    720    backend_dev->rdma_dev_res->stats.rx_bufs_err++;
    721    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);
    722
    723err_free_bctx:
    724    g_free(bctx);
    725}
    726
    727int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd)
    728{
    729    pd->ibpd = ibv_alloc_pd(backend_dev->context);
    730
    731    if (!pd->ibpd) {
    732        rdma_error_report("ibv_alloc_pd fail, errno=%d", errno);
    733        return -EIO;
    734    }
    735
    736    return 0;
    737}
    738
    739void rdma_backend_destroy_pd(RdmaBackendPD *pd)
    740{
    741    if (pd->ibpd) {
    742        ibv_dealloc_pd(pd->ibpd);
    743    }
    744}
    745
    746int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
    747                           size_t length, uint64_t guest_start, int access)
    748{
    749#ifdef LEGACY_RDMA_REG_MR
    750    mr->ibmr = ibv_reg_mr(pd->ibpd, addr, length, access);
    751#else
    752    mr->ibmr = ibv_reg_mr_iova(pd->ibpd, addr, length, guest_start, access);
    753#endif
    754    if (!mr->ibmr) {
    755        rdma_error_report("ibv_reg_mr fail, errno=%d", errno);
    756        return -EIO;
    757    }
    758
    759    mr->ibpd = pd->ibpd;
    760
    761    return 0;
    762}
    763
    764void rdma_backend_destroy_mr(RdmaBackendMR *mr)
    765{
    766    if (mr->ibmr) {
    767        ibv_dereg_mr(mr->ibmr);
    768    }
    769}
    770
    771int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
    772                           int cqe)
    773{
    774    int rc;
    775
    776    cq->ibcq = ibv_create_cq(backend_dev->context, cqe + 1, NULL,
    777                             backend_dev->channel, 0);
    778    if (!cq->ibcq) {
    779        rdma_error_report("ibv_create_cq fail, errno=%d", errno);
    780        return -EIO;
    781    }
    782
    783    rc = ibv_req_notify_cq(cq->ibcq, 0);
    784    if (rc) {
    785        rdma_warn_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc, errno);
    786    }
    787
    788    cq->backend_dev = backend_dev;
    789
    790    return 0;
    791}
    792
    793void rdma_backend_destroy_cq(RdmaBackendCQ *cq)
    794{
    795    if (cq->ibcq) {
    796        ibv_destroy_cq(cq->ibcq);
    797    }
    798}
    799
    800int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
    801                           RdmaBackendPD *pd, RdmaBackendCQ *scq,
    802                           RdmaBackendCQ *rcq, RdmaBackendSRQ *srq,
    803                           uint32_t max_send_wr, uint32_t max_recv_wr,
    804                           uint32_t max_send_sge, uint32_t max_recv_sge)
    805{
    806    struct ibv_qp_init_attr attr = {};
    807
    808    qp->ibqp = 0;
    809
    810    switch (qp_type) {
    811    case IBV_QPT_GSI:
    812        return 0;
    813
    814    case IBV_QPT_RC:
    815        /* fall through */
    816    case IBV_QPT_UD:
    817        /* do nothing */
    818        break;
    819
    820    default:
    821        rdma_error_report("Unsupported QP type %d", qp_type);
    822        return -EIO;
    823    }
    824
    825    attr.qp_type = qp_type;
    826    attr.send_cq = scq->ibcq;
    827    attr.recv_cq = rcq->ibcq;
    828    attr.cap.max_send_wr = max_send_wr;
    829    attr.cap.max_recv_wr = max_recv_wr;
    830    attr.cap.max_send_sge = max_send_sge;
    831    attr.cap.max_recv_sge = max_recv_sge;
    832    if (srq) {
    833        attr.srq = srq->ibsrq;
    834    }
    835
    836    qp->ibqp = ibv_create_qp(pd->ibpd, &attr);
    837    if (!qp->ibqp) {
    838        rdma_error_report("ibv_create_qp fail, errno=%d", errno);
    839        return -EIO;
    840    }
    841
    842    rdma_protected_gslist_init(&qp->cqe_ctx_list);
    843
    844    qp->ibpd = pd->ibpd;
    845
    846    /* TODO: Query QP to get max_inline_data and save it to be used in send */
    847
    848    return 0;
    849}
    850
    851int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
    852                               uint8_t qp_type, uint32_t qkey)
    853{
    854    struct ibv_qp_attr attr = {};
    855    int rc, attr_mask;
    856
    857    attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
    858    attr.qp_state        = IBV_QPS_INIT;
    859    attr.pkey_index      = 0;
    860    attr.port_num        = backend_dev->port_num;
    861
    862    switch (qp_type) {
    863    case IBV_QPT_RC:
    864        attr_mask |= IBV_QP_ACCESS_FLAGS;
    865        trace_rdma_backend_rc_qp_state_init(qp->ibqp->qp_num);
    866        break;
    867
    868    case IBV_QPT_UD:
    869        attr.qkey = qkey;
    870        attr_mask |= IBV_QP_QKEY;
    871        trace_rdma_backend_ud_qp_state_init(qp->ibqp->qp_num, qkey);
    872        break;
    873
    874    default:
    875        rdma_error_report("Unsupported QP type %d", qp_type);
    876        return -EIO;
    877    }
    878
    879    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    880    if (rc) {
    881        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
    882        return -EIO;
    883    }
    884
    885    return 0;
    886}
    887
    888int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
    889                              uint8_t qp_type, uint8_t sgid_idx,
    890                              union ibv_gid *dgid, uint32_t dqpn,
    891                              uint32_t rq_psn, uint32_t qkey, bool use_qkey)
    892{
    893    struct ibv_qp_attr attr = {};
    894    union ibv_gid ibv_gid = {
    895        .global.interface_id = dgid->global.interface_id,
    896        .global.subnet_prefix = dgid->global.subnet_prefix
    897    };
    898    int rc, attr_mask;
    899
    900    attr.qp_state = IBV_QPS_RTR;
    901    attr_mask = IBV_QP_STATE;
    902
    903    qp->sgid_idx = sgid_idx;
    904
    905    switch (qp_type) {
    906    case IBV_QPT_RC:
    907        attr.path_mtu               = IBV_MTU_1024;
    908        attr.dest_qp_num            = dqpn;
    909        attr.max_dest_rd_atomic     = 1;
    910        attr.min_rnr_timer          = 12;
    911        attr.ah_attr.port_num       = backend_dev->port_num;
    912        attr.ah_attr.is_global      = 1;
    913        attr.ah_attr.grh.hop_limit  = 1;
    914        attr.ah_attr.grh.dgid       = ibv_gid;
    915        attr.ah_attr.grh.sgid_index = qp->sgid_idx;
    916        attr.rq_psn                 = rq_psn;
    917
    918        attr_mask |= IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
    919                     IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC |
    920                     IBV_QP_MIN_RNR_TIMER;
    921
    922        trace_rdma_backend_rc_qp_state_rtr(qp->ibqp->qp_num,
    923                                           be64_to_cpu(ibv_gid.global.
    924                                                       subnet_prefix),
    925                                           be64_to_cpu(ibv_gid.global.
    926                                                       interface_id),
    927                                           qp->sgid_idx, dqpn, rq_psn);
    928        break;
    929
    930    case IBV_QPT_UD:
    931        if (use_qkey) {
    932            attr.qkey = qkey;
    933            attr_mask |= IBV_QP_QKEY;
    934        }
    935        trace_rdma_backend_ud_qp_state_rtr(qp->ibqp->qp_num, use_qkey ? qkey :
    936                                           0);
    937        break;
    938    }
    939
    940    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    941    if (rc) {
    942        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
    943        return -EIO;
    944    }
    945
    946    return 0;
    947}
    948
    949int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
    950                              uint32_t sq_psn, uint32_t qkey, bool use_qkey)
    951{
    952    struct ibv_qp_attr attr = {};
    953    int rc, attr_mask;
    954
    955    attr.qp_state = IBV_QPS_RTS;
    956    attr.sq_psn = sq_psn;
    957    attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;
    958
    959    switch (qp_type) {
    960    case IBV_QPT_RC:
    961        attr.timeout       = 14;
    962        attr.retry_cnt     = 7;
    963        attr.rnr_retry     = 7;
    964        attr.max_rd_atomic = 1;
    965
    966        attr_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
    967                     IBV_QP_MAX_QP_RD_ATOMIC;
    968        trace_rdma_backend_rc_qp_state_rts(qp->ibqp->qp_num, sq_psn);
    969        break;
    970
    971    case IBV_QPT_UD:
    972        if (use_qkey) {
    973            attr.qkey = qkey;
    974            attr_mask |= IBV_QP_QKEY;
    975        }
    976        trace_rdma_backend_ud_qp_state_rts(qp->ibqp->qp_num, sq_psn,
    977                                           use_qkey ? qkey : 0);
    978        break;
    979    }
    980
    981    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    982    if (rc) {
    983        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
    984        return -EIO;
    985    }
    986
    987    return 0;
    988}
    989
    990int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
    991                          int attr_mask, struct ibv_qp_init_attr *init_attr)
    992{
    993    if (!qp->ibqp) {
    994        attr->qp_state = IBV_QPS_RTS;
    995        return 0;
    996    }
    997
    998    return ibv_query_qp(qp->ibqp, attr, attr_mask, init_attr);
    999}
   1000
   1001void rdma_backend_destroy_qp(RdmaBackendQP *qp, RdmaDeviceResources *dev_res)
   1002{
   1003    if (qp->ibqp) {
   1004        ibv_destroy_qp(qp->ibqp);
   1005    }
   1006    g_slist_foreach(qp->cqe_ctx_list.list, free_cqe_ctx, dev_res);
   1007    rdma_protected_gslist_destroy(&qp->cqe_ctx_list);
   1008}
   1009
   1010int rdma_backend_create_srq(RdmaBackendSRQ *srq, RdmaBackendPD *pd,
   1011                            uint32_t max_wr, uint32_t max_sge,
   1012                            uint32_t srq_limit)
   1013{
   1014    struct ibv_srq_init_attr srq_init_attr = {};
   1015
   1016    srq_init_attr.attr.max_wr = max_wr;
   1017    srq_init_attr.attr.max_sge = max_sge;
   1018    srq_init_attr.attr.srq_limit = srq_limit;
   1019
   1020    srq->ibsrq = ibv_create_srq(pd->ibpd, &srq_init_attr);
   1021    if (!srq->ibsrq) {
   1022        rdma_error_report("ibv_create_srq failed, errno=%d", errno);
   1023        return -EIO;
   1024    }
   1025
   1026    rdma_protected_gslist_init(&srq->cqe_ctx_list);
   1027
   1028    return 0;
   1029}
   1030
   1031int rdma_backend_query_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr)
   1032{
   1033    if (!srq->ibsrq) {
   1034        return -EINVAL;
   1035    }
   1036
   1037    return ibv_query_srq(srq->ibsrq, srq_attr);
   1038}
   1039
   1040int rdma_backend_modify_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr,
   1041                int srq_attr_mask)
   1042{
   1043    if (!srq->ibsrq) {
   1044        return -EINVAL;
   1045    }
   1046
   1047    return ibv_modify_srq(srq->ibsrq, srq_attr, srq_attr_mask);
   1048}
   1049
   1050void rdma_backend_destroy_srq(RdmaBackendSRQ *srq, RdmaDeviceResources *dev_res)
   1051{
   1052    if (srq->ibsrq) {
   1053        ibv_destroy_srq(srq->ibsrq);
   1054    }
   1055    g_slist_foreach(srq->cqe_ctx_list.list, free_cqe_ctx, dev_res);
   1056    rdma_protected_gslist_destroy(&srq->cqe_ctx_list);
   1057}
   1058
   1059#define CHK_ATTR(req, dev, member, fmt) ({ \
   1060    trace_rdma_check_dev_attr(#member, dev.member, req->member); \
   1061    if (req->member > dev.member) { \
   1062        rdma_warn_report("%s = "fmt" is higher than host device capability "fmt, \
   1063                         #member, req->member, dev.member); \
   1064        req->member = dev.member; \
   1065    } \
   1066})
   1067
   1068static int init_device_caps(RdmaBackendDev *backend_dev,
   1069                            struct ibv_device_attr *dev_attr)
   1070{
   1071    struct ibv_device_attr bk_dev_attr;
   1072    int rc;
   1073
   1074    rc = ibv_query_device(backend_dev->context, &bk_dev_attr);
   1075    if (rc) {
   1076        rdma_error_report("ibv_query_device fail, rc=%d, errno=%d", rc, errno);
   1077        return -EIO;
   1078    }
   1079
   1080    dev_attr->max_sge = MAX_SGE;
   1081    dev_attr->max_srq_sge = MAX_SGE;
   1082
   1083    CHK_ATTR(dev_attr, bk_dev_attr, max_mr_size, "%" PRId64);
   1084    CHK_ATTR(dev_attr, bk_dev_attr, max_qp, "%d");
   1085    CHK_ATTR(dev_attr, bk_dev_attr, max_sge, "%d");
   1086    CHK_ATTR(dev_attr, bk_dev_attr, max_cq, "%d");
   1087    CHK_ATTR(dev_attr, bk_dev_attr, max_mr, "%d");
   1088    CHK_ATTR(dev_attr, bk_dev_attr, max_pd, "%d");
   1089    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_rd_atom, "%d");
   1090    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_init_rd_atom, "%d");
   1091    CHK_ATTR(dev_attr, bk_dev_attr, max_ah, "%d");
   1092    CHK_ATTR(dev_attr, bk_dev_attr, max_srq, "%d");
   1093
   1094    return 0;
   1095}
   1096
   1097static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
   1098                                 union ibv_gid *my_gid, int paylen)
   1099{
   1100    grh->paylen = htons(paylen);
   1101    grh->sgid = *sgid;
   1102    grh->dgid = *my_gid;
   1103}
   1104
   1105static void process_incoming_mad_req(RdmaBackendDev *backend_dev,
   1106                                     RdmaCmMuxMsg *msg)
   1107{
   1108    unsigned long cqe_ctx_id;
   1109    BackendCtx *bctx;
   1110    char *mad;
   1111
   1112    trace_mad_message("recv", msg->umad.mad, msg->umad_len);
   1113
   1114    cqe_ctx_id = rdma_protected_gqueue_pop_int64(&backend_dev->recv_mads_list);
   1115    if (cqe_ctx_id == -ENOENT) {
   1116        rdma_warn_report("No more free MADs buffers, waiting for a while");
   1117        sleep(THR_POLL_TO);
   1118        return;
   1119    }
   1120
   1121    bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
   1122    if (unlikely(!bctx)) {
   1123        rdma_error_report("No matching ctx for req %ld", cqe_ctx_id);
   1124        backend_dev->rdma_dev_res->stats.mad_rx_err++;
   1125        return;
   1126    }
   1127
   1128    mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
   1129                           bctx->sge.length);
   1130    if (!mad || bctx->sge.length < msg->umad_len + MAD_HDR_SIZE) {
   1131        backend_dev->rdma_dev_res->stats.mad_rx_err++;
   1132        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
   1133                      bctx->up_ctx);
   1134    } else {
   1135        struct ibv_wc wc = {};
   1136        memset(mad, 0, bctx->sge.length);
   1137        build_mad_hdr((struct ibv_grh *)mad,
   1138                      (union ibv_gid *)&msg->umad.hdr.addr.gid, &msg->hdr.sgid,
   1139                      msg->umad_len);
   1140        memcpy(&mad[MAD_HDR_SIZE], msg->umad.mad, msg->umad_len);
   1141        rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);
   1142
   1143        wc.byte_len = msg->umad_len;
   1144        wc.status = IBV_WC_SUCCESS;
   1145        wc.wc_flags = IBV_WC_GRH;
   1146        backend_dev->rdma_dev_res->stats.mad_rx++;
   1147        comp_handler(bctx->up_ctx, &wc);
   1148    }
   1149
   1150    g_free(bctx);
   1151    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
   1152}
   1153
   1154static inline int rdmacm_mux_can_receive(void *opaque)
   1155{
   1156    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
   1157
   1158    return rdmacm_mux_can_process_async(backend_dev);
   1159}
   1160
   1161static void rdmacm_mux_read(void *opaque, const uint8_t *buf, int size)
   1162{
   1163    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
   1164    RdmaCmMuxMsg *msg = (RdmaCmMuxMsg *)buf;
   1165
   1166    trace_rdmacm_mux("read", msg->hdr.msg_type, msg->hdr.op_code);
   1167
   1168    if (msg->hdr.msg_type != RDMACM_MUX_MSG_TYPE_REQ &&
   1169        msg->hdr.op_code != RDMACM_MUX_OP_CODE_MAD) {
   1170            rdma_error_report("Error: Not a MAD request, skipping");
   1171            return;
   1172    }
   1173    process_incoming_mad_req(backend_dev, msg);
   1174}
   1175
   1176static int mad_init(RdmaBackendDev *backend_dev, CharBackend *mad_chr_be)
   1177{
   1178    int ret;
   1179
   1180    backend_dev->rdmacm_mux.chr_be = mad_chr_be;
   1181
   1182    ret = qemu_chr_fe_backend_connected(backend_dev->rdmacm_mux.chr_be);
   1183    if (!ret) {
   1184        rdma_error_report("Missing chardev for MAD multiplexer");
   1185        return -EIO;
   1186    }
   1187
   1188    rdma_protected_gqueue_init(&backend_dev->recv_mads_list);
   1189
   1190    enable_rdmacm_mux_async(backend_dev);
   1191
   1192    qemu_chr_fe_set_handlers(backend_dev->rdmacm_mux.chr_be,
   1193                             rdmacm_mux_can_receive, rdmacm_mux_read, NULL,
   1194                             NULL, backend_dev, NULL, true);
   1195
   1196    return 0;
   1197}
   1198
   1199static void mad_stop(RdmaBackendDev *backend_dev)
   1200{
   1201    clean_recv_mads(backend_dev);
   1202}
   1203
   1204static void mad_fini(RdmaBackendDev *backend_dev)
   1205{
   1206    disable_rdmacm_mux_async(backend_dev);
   1207    qemu_chr_fe_disconnect(backend_dev->rdmacm_mux.chr_be);
   1208    rdma_protected_gqueue_destroy(&backend_dev->recv_mads_list);
   1209}
   1210
   1211int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
   1212                               union ibv_gid *gid)
   1213{
   1214    union ibv_gid sgid;
   1215    int ret;
   1216    int i = 0;
   1217
   1218    do {
   1219        ret = ibv_query_gid(backend_dev->context, backend_dev->port_num, i,
   1220                            &sgid);
   1221        i++;
   1222    } while (!ret && (memcmp(&sgid, gid, sizeof(*gid))));
   1223
   1224    trace_rdma_backend_get_gid_index(be64_to_cpu(gid->global.subnet_prefix),
   1225                                     be64_to_cpu(gid->global.interface_id),
   1226                                     i - 1);
   1227
   1228    return ret ? ret : i - 1;
   1229}
   1230
   1231int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
   1232                         union ibv_gid *gid)
   1233{
   1234    RdmaCmMuxMsg msg = {};
   1235    int ret;
   1236
   1237    trace_rdma_backend_gid_change("add", be64_to_cpu(gid->global.subnet_prefix),
   1238                                  be64_to_cpu(gid->global.interface_id));
   1239
   1240    msg.hdr.op_code = RDMACM_MUX_OP_CODE_REG;
   1241    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));
   1242
   1243    ret = rdmacm_mux_send(backend_dev, &msg);
   1244    if (ret) {
   1245        rdma_error_report("Failed to register GID to rdma_umadmux (%d)", ret);
   1246        return -EIO;
   1247    }
   1248
   1249    qapi_event_send_rdma_gid_status_changed(ifname, true,
   1250                                            gid->global.subnet_prefix,
   1251                                            gid->global.interface_id);
   1252
   1253    return ret;
   1254}
   1255
   1256int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
   1257                         union ibv_gid *gid)
   1258{
   1259    RdmaCmMuxMsg msg = {};
   1260    int ret;
   1261
   1262    trace_rdma_backend_gid_change("del", be64_to_cpu(gid->global.subnet_prefix),
   1263                                  be64_to_cpu(gid->global.interface_id));
   1264
   1265    msg.hdr.op_code = RDMACM_MUX_OP_CODE_UNREG;
   1266    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));
   1267
   1268    ret = rdmacm_mux_send(backend_dev, &msg);
   1269    if (ret) {
   1270        rdma_error_report("Failed to unregister GID from rdma_umadmux (%d)",
   1271                          ret);
   1272        return -EIO;
   1273    }
   1274
   1275    qapi_event_send_rdma_gid_status_changed(ifname, false,
   1276                                            gid->global.subnet_prefix,
   1277                                            gid->global.interface_id);
   1278
   1279    return 0;
   1280}
   1281
   1282int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
   1283                      RdmaDeviceResources *rdma_dev_res,
   1284                      const char *backend_device_name, uint8_t port_num,
   1285                      struct ibv_device_attr *dev_attr, CharBackend *mad_chr_be)
   1286{
   1287    int i;
   1288    int ret = 0;
   1289    int num_ibv_devices;
   1290    struct ibv_device **dev_list;
   1291
   1292    memset(backend_dev, 0, sizeof(*backend_dev));
   1293
   1294    backend_dev->dev = pdev;
   1295    backend_dev->port_num = port_num;
   1296    backend_dev->rdma_dev_res = rdma_dev_res;
   1297
   1298    rdma_backend_register_comp_handler(dummy_comp_handler);
   1299
   1300    dev_list = ibv_get_device_list(&num_ibv_devices);
   1301    if (!dev_list) {
   1302        rdma_error_report("Failed to get IB devices list");
   1303        return -EIO;
   1304    }
   1305
   1306    if (num_ibv_devices == 0) {
   1307        rdma_error_report("No IB devices were found");
   1308        ret = -ENXIO;
   1309        goto out_free_dev_list;
   1310    }
   1311
   1312    if (backend_device_name) {
   1313        for (i = 0; dev_list[i]; ++i) {
   1314            if (!strcmp(ibv_get_device_name(dev_list[i]),
   1315                        backend_device_name)) {
   1316                break;
   1317            }
   1318        }
   1319
   1320        backend_dev->ib_dev = dev_list[i];
   1321        if (!backend_dev->ib_dev) {
   1322            rdma_error_report("Failed to find IB device %s",
   1323                              backend_device_name);
   1324            ret = -EIO;
   1325            goto out_free_dev_list;
   1326        }
   1327    } else {
   1328        backend_dev->ib_dev = *dev_list;
   1329    }
   1330
   1331    rdma_info_report("uverb device %s", backend_dev->ib_dev->dev_name);
   1332
   1333    backend_dev->context = ibv_open_device(backend_dev->ib_dev);
   1334    if (!backend_dev->context) {
   1335        rdma_error_report("Failed to open IB device %s",
   1336                          ibv_get_device_name(backend_dev->ib_dev));
   1337        ret = -EIO;
   1338        goto out;
   1339    }
   1340
   1341    backend_dev->channel = ibv_create_comp_channel(backend_dev->context);
   1342    if (!backend_dev->channel) {
   1343        rdma_error_report("Failed to create IB communication channel");
   1344        ret = -EIO;
   1345        goto out_close_device;
   1346    }
   1347
   1348    ret = init_device_caps(backend_dev, dev_attr);
   1349    if (ret) {
   1350        rdma_error_report("Failed to initialize device capabilities");
   1351        ret = -EIO;
   1352        goto out_destroy_comm_channel;
   1353    }
   1354
   1355
   1356    ret = mad_init(backend_dev, mad_chr_be);
   1357    if (ret) {
   1358        rdma_error_report("Failed to initialize mad");
   1359        ret = -EIO;
   1360        goto out_destroy_comm_channel;
   1361    }
   1362
   1363    backend_dev->comp_thread.run = false;
   1364    backend_dev->comp_thread.is_running = false;
   1365
   1366    ah_cache_init();
   1367
   1368    goto out_free_dev_list;
   1369
   1370out_destroy_comm_channel:
   1371    ibv_destroy_comp_channel(backend_dev->channel);
   1372
   1373out_close_device:
   1374    ibv_close_device(backend_dev->context);
   1375
   1376out_free_dev_list:
   1377    ibv_free_device_list(dev_list);
   1378
   1379out:
   1380    return ret;
   1381}
   1382
   1383
   1384void rdma_backend_start(RdmaBackendDev *backend_dev)
   1385{
   1386    start_comp_thread(backend_dev);
   1387}
   1388
   1389void rdma_backend_stop(RdmaBackendDev *backend_dev)
   1390{
   1391    mad_stop(backend_dev);
   1392    stop_backend_thread(&backend_dev->comp_thread);
   1393}
   1394
   1395void rdma_backend_fini(RdmaBackendDev *backend_dev)
   1396{
   1397    mad_fini(backend_dev);
   1398    g_hash_table_destroy(ah_hash);
   1399    ibv_destroy_comp_channel(backend_dev->channel);
   1400    ibv_close_device(backend_dev->context);
   1401}