spapr_pci_nvlink2.c - cachepc-qemu - Fork of AMDESE/qemu with changes for cachepc side-channel attack

	cachepc-qemu Fork of AMDESE/qemu with changes for cachepc side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-qemu
	Log \| Files \| Refs \| Submodules \| LICENSE \| sfeed.txt
spapr_pci_nvlink2.c (15564B)
      1/*
      2 * QEMU sPAPR PCI for NVLink2 pass through
      3 *
      4 * Copyright (c) 2019 Alexey Kardashevskiy, IBM Corporation.
      5 *
      6 * Permission is hereby granted, free of charge, to any person obtaining a copy
      7 * of this software and associated documentation files (the "Software"), to deal
      8 * in the Software without restriction, including without limitation the rights
      9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10 * copies of the Software, and to permit persons to whom the Software is
     11 * furnished to do so, subject to the following conditions:
     12 *
     13 * The above copyright notice and this permission notice shall be included in
     14 * all copies or substantial portions of the Software.
     15 *
     16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22 * THE SOFTWARE.
     23 */
     24#include "qemu/osdep.h"
     25#include "qapi/error.h"
     26#include "qemu-common.h"
     27#include "hw/pci/pci.h"
     28#include "hw/pci-host/spapr.h"
     29#include "hw/ppc/spapr_numa.h"
     30#include "qemu/error-report.h"
     31#include "hw/ppc/fdt.h"
     32#include "hw/pci/pci_bridge.h"
     33
     34#define PHANDLE_PCIDEV(phb, pdev)    (0x12000000 | \
     35                                     (((phb)->index) << 16) | ((pdev)->devfn))
     36#define PHANDLE_GPURAM(phb, n)       (0x110000FF | ((n) << 8) | \
     37                                     (((phb)->index) << 16))
     38#define PHANDLE_NVLINK(phb, gn, nn)  (0x00130000 | (((phb)->index) << 8) | \
     39                                     ((gn) << 4) | (nn))
     40
     41typedef struct SpaprPhbPciNvGpuSlot {
     42        uint64_t tgt;
     43        uint64_t gpa;
     44        unsigned numa_id;
     45        PCIDevice *gpdev;
     46        int linknum;
     47        struct {
     48            uint64_t atsd_gpa;
     49            PCIDevice *npdev;
     50            uint32_t link_speed;
     51        } links[NVGPU_MAX_LINKS];
     52} SpaprPhbPciNvGpuSlot;
     53
     54struct SpaprPhbPciNvGpuConfig {
     55    uint64_t nv2_ram_current;
     56    uint64_t nv2_atsd_current;
     57    int num; /* number of non empty (i.e. tgt!=0) entries in slots[] */
     58    SpaprPhbPciNvGpuSlot slots[NVGPU_MAX_NUM];
     59    Error *err;
     60};
     61
     62static SpaprPhbPciNvGpuSlot *
     63spapr_nvgpu_get_slot(SpaprPhbPciNvGpuConfig *nvgpus, uint64_t tgt)
     64{
     65    int i;
     66
     67    /* Search for partially collected "slot" */
     68    for (i = 0; i < nvgpus->num; ++i) {
     69        if (nvgpus->slots[i].tgt == tgt) {
     70            return &nvgpus->slots[i];
     71        }
     72    }
     73
     74    if (nvgpus->num == ARRAY_SIZE(nvgpus->slots)) {
     75        return NULL;
     76    }
     77
     78    i = nvgpus->num;
     79    nvgpus->slots[i].tgt = tgt;
     80    ++nvgpus->num;
     81
     82    return &nvgpus->slots[i];
     83}
     84
     85static void spapr_pci_collect_nvgpu(SpaprPhbPciNvGpuConfig *nvgpus,
     86                                    PCIDevice *pdev, uint64_t tgt,
     87                                    MemoryRegion *mr, Error **errp)
     88{
     89    MachineState *machine = MACHINE(qdev_get_machine());
     90    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
     91    SpaprPhbPciNvGpuSlot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);
     92
     93    if (!nvslot) {
     94        error_setg(errp, "Found too many GPUs per vPHB");
     95        return;
     96    }
     97    g_assert(!nvslot->gpdev);
     98    nvslot->gpdev = pdev;
     99
    100    nvslot->gpa = nvgpus->nv2_ram_current;
    101    nvgpus->nv2_ram_current += memory_region_size(mr);
    102    nvslot->numa_id = spapr->gpu_numa_id;
    103    ++spapr->gpu_numa_id;
    104}
    105
    106static void spapr_pci_collect_nvnpu(SpaprPhbPciNvGpuConfig *nvgpus,
    107                                    PCIDevice *pdev, uint64_t tgt,
    108                                    MemoryRegion *mr, Error **errp)
    109{
    110    SpaprPhbPciNvGpuSlot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);
    111    int j;
    112
    113    if (!nvslot) {
    114        error_setg(errp, "Found too many NVLink bridges per vPHB");
    115        return;
    116    }
    117
    118    j = nvslot->linknum;
    119    if (j == ARRAY_SIZE(nvslot->links)) {
    120        error_setg(errp, "Found too many NVLink bridges per GPU");
    121        return;
    122    }
    123    ++nvslot->linknum;
    124
    125    g_assert(!nvslot->links[j].npdev);
    126    nvslot->links[j].npdev = pdev;
    127    nvslot->links[j].atsd_gpa = nvgpus->nv2_atsd_current;
    128    nvgpus->nv2_atsd_current += memory_region_size(mr);
    129    nvslot->links[j].link_speed =
    130        object_property_get_uint(OBJECT(pdev), "nvlink2-link-speed", NULL);
    131}
    132
    133static void spapr_phb_pci_collect_nvgpu(PCIBus *bus, PCIDevice *pdev,
    134                                        void *opaque)
    135{
    136    PCIBus *sec_bus;
    137    Object *po = OBJECT(pdev);
    138    uint64_t tgt = object_property_get_uint(po, "nvlink2-tgt", NULL);
    139
    140    if (tgt) {
    141        Error *local_err = NULL;
    142        SpaprPhbPciNvGpuConfig *nvgpus = opaque;
    143        Object *mr_gpu = object_property_get_link(po, "nvlink2-mr[0]", NULL);
    144        Object *mr_npu = object_property_get_link(po, "nvlink2-atsd-mr[0]",
    145                                                  NULL);
    146
    147        g_assert(mr_gpu || mr_npu);
    148        if (mr_gpu) {
    149            spapr_pci_collect_nvgpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_gpu),
    150                                    &local_err);
    151        } else {
    152            spapr_pci_collect_nvnpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_npu),
    153                                    &local_err);
    154        }
    155        error_propagate(&nvgpus->err, local_err);
    156    }
    157    if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) !=
    158         PCI_HEADER_TYPE_BRIDGE)) {
    159        return;
    160    }
    161
    162    sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
    163    if (!sec_bus) {
    164        return;
    165    }
    166
    167    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
    168                        spapr_phb_pci_collect_nvgpu, opaque);
    169}
    170
    171void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp)
    172{
    173    int i, j, valid_gpu_num;
    174    PCIBus *bus;
    175
    176    /* Search for GPUs and NPUs */
    177    if (!sphb->nv2_gpa_win_addr || !sphb->nv2_atsd_win_addr) {
    178        return;
    179    }
    180
    181    sphb->nvgpus = g_new0(SpaprPhbPciNvGpuConfig, 1);
    182    sphb->nvgpus->nv2_ram_current = sphb->nv2_gpa_win_addr;
    183    sphb->nvgpus->nv2_atsd_current = sphb->nv2_atsd_win_addr;
    184
    185    bus = PCI_HOST_BRIDGE(sphb)->bus;
    186    pci_for_each_device(bus, pci_bus_num(bus),
    187                        spapr_phb_pci_collect_nvgpu, sphb->nvgpus);
    188
    189    if (sphb->nvgpus->err) {
    190        error_propagate(errp, sphb->nvgpus->err);
    191        sphb->nvgpus->err = NULL;
    192        goto cleanup_exit;
    193    }
    194
    195    /* Add found GPU RAM and ATSD MRs if found */
    196    for (i = 0, valid_gpu_num = 0; i < sphb->nvgpus->num; ++i) {
    197        Object *nvmrobj;
    198        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
    199
    200        if (!nvslot->gpdev) {
    201            continue;
    202        }
    203        nvmrobj = object_property_get_link(OBJECT(nvslot->gpdev),
    204                                           "nvlink2-mr[0]", NULL);
    205        /* ATSD is pointless without GPU RAM MR so skip those */
    206        if (!nvmrobj) {
    207            continue;
    208        }
    209
    210        ++valid_gpu_num;
    211        memory_region_add_subregion(get_system_memory(), nvslot->gpa,
    212                                    MEMORY_REGION(nvmrobj));
    213
    214        for (j = 0; j < nvslot->linknum; ++j) {
    215            Object *atsdmrobj;
    216
    217            atsdmrobj = object_property_get_link(OBJECT(nvslot->links[j].npdev),
    218                                                 "nvlink2-atsd-mr[0]", NULL);
    219            if (!atsdmrobj) {
    220                continue;
    221            }
    222            memory_region_add_subregion(get_system_memory(),
    223                                        nvslot->links[j].atsd_gpa,
    224                                        MEMORY_REGION(atsdmrobj));
    225        }
    226    }
    227
    228    if (valid_gpu_num) {
    229        return;
    230    }
    231    /* We did not find any interesting GPU */
    232cleanup_exit:
    233    g_free(sphb->nvgpus);
    234    sphb->nvgpus = NULL;
    235}
    236
    237void spapr_phb_nvgpu_free(SpaprPhbState *sphb)
    238{
    239    int i, j;
    240
    241    if (!sphb->nvgpus) {
    242        return;
    243    }
    244
    245    for (i = 0; i < sphb->nvgpus->num; ++i) {
    246        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
    247        Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
    248                                                    "nvlink2-mr[0]", NULL);
    249
    250        if (nv_mrobj) {
    251            memory_region_del_subregion(get_system_memory(),
    252                                        MEMORY_REGION(nv_mrobj));
    253        }
    254        for (j = 0; j < nvslot->linknum; ++j) {
    255            PCIDevice *npdev = nvslot->links[j].npdev;
    256            Object *atsd_mrobj;
    257            atsd_mrobj = object_property_get_link(OBJECT(npdev),
    258                                                  "nvlink2-atsd-mr[0]", NULL);
    259            if (atsd_mrobj) {
    260                memory_region_del_subregion(get_system_memory(),
    261                                            MEMORY_REGION(atsd_mrobj));
    262            }
    263        }
    264    }
    265    g_free(sphb->nvgpus);
    266    sphb->nvgpus = NULL;
    267}
    268
    269void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off,
    270                                 Error **errp)
    271{
    272    int i, j, atsdnum = 0;
    273    uint64_t atsd[8]; /* The existing limitation of known guests */
    274
    275    if (!sphb->nvgpus) {
    276        return;
    277    }
    278
    279    for (i = 0; (i < sphb->nvgpus->num) && (atsdnum < ARRAY_SIZE(atsd)); ++i) {
    280        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
    281
    282        if (!nvslot->gpdev) {
    283            continue;
    284        }
    285        for (j = 0; j < nvslot->linknum; ++j) {
    286            if (!nvslot->links[j].atsd_gpa) {
    287                continue;
    288            }
    289
    290            if (atsdnum == ARRAY_SIZE(atsd)) {
    291                error_report("Only %"PRIuPTR" ATSD registers supported",
    292                             ARRAY_SIZE(atsd));
    293                break;
    294            }
    295            atsd[atsdnum] = cpu_to_be64(nvslot->links[j].atsd_gpa);
    296            ++atsdnum;
    297        }
    298    }
    299
    300    if (!atsdnum) {
    301        error_setg(errp, "No ATSD registers found");
    302        return;
    303    }
    304
    305    if (!spapr_phb_eeh_available(sphb)) {
    306        /*
    307         * ibm,mmio-atsd contains ATSD registers; these belong to an NPU PHB
    308         * which we do not emulate as a separate device. Instead we put
    309         * ibm,mmio-atsd to the vPHB with GPU and make sure that we do not
    310         * put GPUs from different IOMMU groups to the same vPHB to ensure
    311         * that the guest will use ATSDs from the corresponding NPU.
    312         */
    313        error_setg(errp, "ATSD requires separate vPHB per GPU IOMMU group");
    314        return;
    315    }
    316
    317    _FDT((fdt_setprop(fdt, bus_off, "ibm,mmio-atsd", atsd,
    318                      atsdnum * sizeof(atsd[0]))));
    319}
    320
    321void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
    322{
    323    int i, j, linkidx, npuoff;
    324    char *npuname;
    325
    326    if (!sphb->nvgpus) {
    327        return;
    328    }
    329
    330    npuname = g_strdup_printf("npuphb%d", sphb->index);
    331    npuoff = fdt_add_subnode(fdt, 0, npuname);
    332    _FDT(npuoff);
    333    _FDT(fdt_setprop_cell(fdt, npuoff, "#address-cells", 1));
    334    _FDT(fdt_setprop_cell(fdt, npuoff, "#size-cells", 0));
    335    /* Advertise NPU as POWER9 so the guest can enable NPU2 contexts */
    336    _FDT((fdt_setprop_string(fdt, npuoff, "compatible", "ibm,power9-npu")));
    337    g_free(npuname);
    338
    339    for (i = 0, linkidx = 0; i < sphb->nvgpus->num; ++i) {
    340        for (j = 0; j < sphb->nvgpus->slots[i].linknum; ++j) {
    341            char *linkname = g_strdup_printf("link@%d", linkidx);
    342            int off = fdt_add_subnode(fdt, npuoff, linkname);
    343
    344            _FDT(off);
    345            /* _FDT((fdt_setprop_cell(fdt, off, "reg", linkidx))); */
    346            _FDT((fdt_setprop_string(fdt, off, "compatible",
    347                                     "ibm,npu-link")));
    348            _FDT((fdt_setprop_cell(fdt, off, "phandle",
    349                                   PHANDLE_NVLINK(sphb, i, j))));
    350            _FDT((fdt_setprop_cell(fdt, off, "ibm,npu-link-index", linkidx)));
    351            g_free(linkname);
    352            ++linkidx;
    353        }
    354    }
    355
    356    /* Add memory nodes for GPU RAM and mark them unusable */
    357    for (i = 0; i < sphb->nvgpus->num; ++i) {
    358        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
    359        Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
    360                                                    "nvlink2-mr[0]",
    361                                                    &error_abort);
    362        uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL);
    363        uint64_t mem_reg[2] = { cpu_to_be64(nvslot->gpa), cpu_to_be64(size) };
    364        char *mem_name = g_strdup_printf("memory@%"PRIx64, nvslot->gpa);
    365        int off = fdt_add_subnode(fdt, 0, mem_name);
    366
    367        _FDT(off);
    368        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
    369        _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));
    370
    371        spapr_numa_write_associativity_dt(SPAPR_MACHINE(qdev_get_machine()),
    372                                          fdt, off, nvslot->numa_id);
    373
    374        _FDT((fdt_setprop_string(fdt, off, "compatible",
    375                                 "ibm,coherent-device-memory")));
    376
    377        mem_reg[1] = cpu_to_be64(0);
    378        _FDT((fdt_setprop(fdt, off, "linux,usable-memory", mem_reg,
    379                          sizeof(mem_reg))));
    380        _FDT((fdt_setprop_cell(fdt, off, "phandle",
    381                               PHANDLE_GPURAM(sphb, i))));
    382        g_free(mem_name);
    383    }
    384
    385}
    386
    387void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
    388                                        SpaprPhbState *sphb)
    389{
    390    int i, j;
    391
    392    if (!sphb->nvgpus) {
    393        return;
    394    }
    395
    396    for (i = 0; i < sphb->nvgpus->num; ++i) {
    397        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
    398
    399        /* Skip "slot" without attached GPU */
    400        if (!nvslot->gpdev) {
    401            continue;
    402        }
    403        if (dev == nvslot->gpdev) {
    404            uint32_t npus[nvslot->linknum];
    405
    406            for (j = 0; j < nvslot->linknum; ++j) {
    407                PCIDevice *npdev = nvslot->links[j].npdev;
    408
    409                npus[j] = cpu_to_be32(PHANDLE_PCIDEV(sphb, npdev));
    410            }
    411            _FDT(fdt_setprop(fdt, offset, "ibm,npu", npus,
    412                             j * sizeof(npus[0])));
    413            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
    414                                   PHANDLE_PCIDEV(sphb, dev))));
    415            continue;
    416        }
    417
    418        for (j = 0; j < nvslot->linknum; ++j) {
    419            if (dev != nvslot->links[j].npdev) {
    420                continue;
    421            }
    422
    423            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
    424                                   PHANDLE_PCIDEV(sphb, dev))));
    425            _FDT(fdt_setprop_cell(fdt, offset, "ibm,gpu",
    426                                  PHANDLE_PCIDEV(sphb, nvslot->gpdev)));
    427            _FDT((fdt_setprop_cell(fdt, offset, "ibm,nvlink",
    428                                   PHANDLE_NVLINK(sphb, i, j))));
    429            /*
    430             * If we ever want to emulate GPU RAM at the same location as on
    431             * the host - here is the encoding GPA->TGT:
    432             *
    433             * gta  = ((sphb->nv2_gpa >> 42) & 0x1) << 42;
    434             * gta |= ((sphb->nv2_gpa >> 45) & 0x3) << 43;
    435             * gta |= ((sphb->nv2_gpa >> 49) & 0x3) << 45;
    436             * gta |= sphb->nv2_gpa & ((1UL << 43) - 1);
    437             */
    438            _FDT(fdt_setprop_cell(fdt, offset, "memory-region",
    439                                  PHANDLE_GPURAM(sphb, i)));
    440            _FDT(fdt_setprop_u64(fdt, offset, "ibm,device-tgt-addr",
    441                                 nvslot->tgt));
    442            _FDT(fdt_setprop_cell(fdt, offset, "ibm,nvlink-speed",
    443                                  nvslot->links[j].link_speed));
    444        }
    445    }
    446}