spapr_pci_nvlink2.c (15564B)
/*
 * QEMU sPAPR PCI for NVLink2 pass through
 *
 * Copyright (c) 2019 Alexey Kardashevskiy, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "hw/pci/pci.h"
#include "hw/pci-host/spapr.h"
#include "hw/ppc/spapr_numa.h"
#include "qemu/error-report.h"
#include "hw/ppc/fdt.h"
#include "hw/pci/pci_bridge.h"

#define PHANDLE_PCIDEV(phb, pdev)    (0x12000000 | \
                                     (((phb)->index) << 16) | ((pdev)->devfn))
#define PHANDLE_GPURAM(phb, n)       (0x110000FF | ((n) << 8) | \
                                     (((phb)->index) << 16))
#define PHANDLE_NVLINK(phb, gn, nn)  (0x00130000 | (((phb)->index) << 8) | \
                                     ((gn) << 4) | (nn))
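
/*
 * For illustration only (the values below are hypothetical, not taken from
 * any real topology): the macros above pack identifying fields into the
 * phandle values exposed to the guest.  With a vPHB of index 1, a GPU at
 * devfn 0x08, GPU slot 0 and link 0, they evaluate to:
 *
 *   PHANDLE_PCIDEV: 0x12000000 | (1 << 16) | 0x08        = 0x12010008
 *   PHANDLE_GPURAM: 0x110000FF | (0 << 8) | (1 << 16)    = 0x110100FF
 *   PHANDLE_NVLINK: 0x00130000 | (1 << 8) | (0 << 4) | 0 = 0x00130100
 *
 * so the vPHB index, devfn, GPU slot and link number can all be read back
 * from a phandle seen in the guest device tree.
 */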

typedef struct SpaprPhbPciNvGpuSlot {
    uint64_t tgt;
    uint64_t gpa;
    unsigned numa_id;
    PCIDevice *gpdev;
    int linknum;
    struct {
        uint64_t atsd_gpa;
        PCIDevice *npdev;
        uint32_t link_speed;
    } links[NVGPU_MAX_LINKS];
} SpaprPhbPciNvGpuSlot;

struct SpaprPhbPciNvGpuConfig {
    uint64_t nv2_ram_current;
    uint64_t nv2_atsd_current;
    int num; /* number of non empty (i.e. tgt!=0) entries in slots[] */
    SpaprPhbPciNvGpuSlot slots[NVGPU_MAX_NUM];
    Error *err;
};

static SpaprPhbPciNvGpuSlot *
spapr_nvgpu_get_slot(SpaprPhbPciNvGpuConfig *nvgpus, uint64_t tgt)
{
    int i;

    /* Search for partially collected "slot" */
    for (i = 0; i < nvgpus->num; ++i) {
        if (nvgpus->slots[i].tgt == tgt) {
            return &nvgpus->slots[i];
        }
    }

    if (nvgpus->num == ARRAY_SIZE(nvgpus->slots)) {
        return NULL;
    }

    i = nvgpus->num;
    nvgpus->slots[i].tgt = tgt;
    ++nvgpus->num;

    return &nvgpus->slots[i];
}

static void spapr_pci_collect_nvgpu(SpaprPhbPciNvGpuConfig *nvgpus,
                                    PCIDevice *pdev, uint64_t tgt,
                                    MemoryRegion *mr, Error **errp)
{
    MachineState *machine = MACHINE(qdev_get_machine());
    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
    SpaprPhbPciNvGpuSlot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);

    if (!nvslot) {
        error_setg(errp, "Found too many GPUs per vPHB");
        return;
    }
    g_assert(!nvslot->gpdev);
    nvslot->gpdev = pdev;

    nvslot->gpa = nvgpus->nv2_ram_current;
    nvgpus->nv2_ram_current += memory_region_size(mr);
    nvslot->numa_id = spapr->gpu_numa_id;
    ++spapr->gpu_numa_id;
}

static void spapr_pci_collect_nvnpu(SpaprPhbPciNvGpuConfig *nvgpus,
                                    PCIDevice *pdev, uint64_t tgt,
                                    MemoryRegion *mr, Error **errp)
{
    SpaprPhbPciNvGpuSlot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);
    int j;

    if (!nvslot) {
        error_setg(errp, "Found too many NVLink bridges per vPHB");
        return;
    }

    j = nvslot->linknum;
    if (j == ARRAY_SIZE(nvslot->links)) {
        error_setg(errp, "Found too many NVLink bridges per GPU");
        return;
    }
    ++nvslot->linknum;

    g_assert(!nvslot->links[j].npdev);
    nvslot->links[j].npdev = pdev;
    nvslot->links[j].atsd_gpa = nvgpus->nv2_atsd_current;
    nvgpus->nv2_atsd_current += memory_region_size(mr);
    nvslot->links[j].link_speed =
        object_property_get_uint(OBJECT(pdev), "nvlink2-link-speed", NULL);
}

static void spapr_phb_pci_collect_nvgpu(PCIBus *bus, PCIDevice *pdev,
                                        void *opaque)
{
    PCIBus *sec_bus;
    Object *po = OBJECT(pdev);
    uint64_t tgt = object_property_get_uint(po, "nvlink2-tgt", NULL);

    if (tgt) {
        Error *local_err = NULL;
        SpaprPhbPciNvGpuConfig *nvgpus = opaque;
        Object *mr_gpu = object_property_get_link(po, "nvlink2-mr[0]", NULL);
        Object *mr_npu = object_property_get_link(po, "nvlink2-atsd-mr[0]",
                                                  NULL);

        g_assert(mr_gpu || mr_npu);
        if (mr_gpu) {
            spapr_pci_collect_nvgpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_gpu),
                                    &local_err);
        } else {
            spapr_pci_collect_nvnpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_npu),
                                    &local_err);
        }
        error_propagate(&nvgpus->err, local_err);
    }
    if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) !=
         PCI_HEADER_TYPE_BRIDGE)) {
        return;
    }

    sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
    if (!sec_bus) {
        return;
    }

    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
                        spapr_phb_pci_collect_nvgpu, opaque);
}

void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp)
{
    int i, j, valid_gpu_num;
    PCIBus *bus;

    /* Search for GPUs and NPUs */
    if (!sphb->nv2_gpa_win_addr || !sphb->nv2_atsd_win_addr) {
        return;
    }

    sphb->nvgpus = g_new0(SpaprPhbPciNvGpuConfig, 1);
    sphb->nvgpus->nv2_ram_current = sphb->nv2_gpa_win_addr;
    sphb->nvgpus->nv2_atsd_current = sphb->nv2_atsd_win_addr;
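
    /*
     * Walk every device on this vPHB (spapr_phb_pci_collect_nvgpu() recurses
     * into bridges) and group GPUs and NPU NVLink bridges that report the
     * same "nvlink2-tgt" value into a single nvgpus->slots[] entry.
     */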
    bus = PCI_HOST_BRIDGE(sphb)->bus;
    pci_for_each_device(bus, pci_bus_num(bus),
                        spapr_phb_pci_collect_nvgpu, sphb->nvgpus);

    if (sphb->nvgpus->err) {
        error_propagate(errp, sphb->nvgpus->err);
        sphb->nvgpus->err = NULL;
        goto cleanup_exit;
    }

    /* Add GPU RAM and ATSD MRs if found */
    for (i = 0, valid_gpu_num = 0; i < sphb->nvgpus->num; ++i) {
        Object *nvmrobj;
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        if (!nvslot->gpdev) {
            continue;
        }
        nvmrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                           "nvlink2-mr[0]", NULL);
        /* ATSD is pointless without GPU RAM MR so skip those */
        if (!nvmrobj) {
            continue;
        }

        ++valid_gpu_num;
        memory_region_add_subregion(get_system_memory(), nvslot->gpa,
                                    MEMORY_REGION(nvmrobj));

        for (j = 0; j < nvslot->linknum; ++j) {
            Object *atsdmrobj;

            atsdmrobj = object_property_get_link(OBJECT(nvslot->links[j].npdev),
                                                 "nvlink2-atsd-mr[0]", NULL);
            if (!atsdmrobj) {
                continue;
            }
            memory_region_add_subregion(get_system_memory(),
                                        nvslot->links[j].atsd_gpa,
                                        MEMORY_REGION(atsdmrobj));
        }
    }

    if (valid_gpu_num) {
        return;
    }
    /* We did not find any interesting GPU */
cleanup_exit:
    g_free(sphb->nvgpus);
    sphb->nvgpus = NULL;
}

void spapr_phb_nvgpu_free(SpaprPhbState *sphb)
{
    int i, j;

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
        Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                                    "nvlink2-mr[0]", NULL);

        if (nv_mrobj) {
            memory_region_del_subregion(get_system_memory(),
                                        MEMORY_REGION(nv_mrobj));
        }
        for (j = 0; j < nvslot->linknum; ++j) {
            PCIDevice *npdev = nvslot->links[j].npdev;
            Object *atsd_mrobj;
            atsd_mrobj = object_property_get_link(OBJECT(npdev),
                                                  "nvlink2-atsd-mr[0]", NULL);
            if (atsd_mrobj) {
                memory_region_del_subregion(get_system_memory(),
                                            MEMORY_REGION(atsd_mrobj));
            }
        }
    }
    g_free(sphb->nvgpus);
    sphb->nvgpus = NULL;
}

void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off,
                                 Error **errp)
{
    int i, j, atsdnum = 0;
    uint64_t atsd[8]; /* The existing limitation of known guests */

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; (i < sphb->nvgpus->num) && (atsdnum < ARRAY_SIZE(atsd)); ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        if (!nvslot->gpdev) {
            continue;
        }
        for (j = 0; j < nvslot->linknum; ++j) {
            if (!nvslot->links[j].atsd_gpa) {
                continue;
            }

            if (atsdnum == ARRAY_SIZE(atsd)) {
                error_report("Only %"PRIuPTR" ATSD registers supported",
                             ARRAY_SIZE(atsd));
                break;
            }
            atsd[atsdnum] = cpu_to_be64(nvslot->links[j].atsd_gpa);
            ++atsdnum;
        }
    }

    if (!atsdnum) {
        error_setg(errp, "No ATSD registers found");
        return;
    }

    if (!spapr_phb_eeh_available(sphb)) {
        /*
         * ibm,mmio-atsd contains ATSD registers; these belong to an NPU PHB
         * which we do not emulate as a separate device. Instead we put
         * ibm,mmio-atsd into the vPHB with the GPU and make sure that we do
         * not put GPUs from different IOMMU groups on the same vPHB, so that
         * the guest will use ATSDs from the corresponding NPU.
         */
        error_setg(errp, "ATSD requires separate vPHB per GPU IOMMU group");
        return;
    }

    _FDT((fdt_setprop(fdt, bus_off, "ibm,mmio-atsd", atsd,
                      atsdnum * sizeof(atsd[0]))));
}

void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
{
    int i, j, linkidx, npuoff;
    char *npuname;

    if (!sphb->nvgpus) {
        return;
    }

    npuname = g_strdup_printf("npuphb%d", sphb->index);
    npuoff = fdt_add_subnode(fdt, 0, npuname);
    _FDT(npuoff);
    _FDT(fdt_setprop_cell(fdt, npuoff, "#address-cells", 1));
    _FDT(fdt_setprop_cell(fdt, npuoff, "#size-cells", 0));
    /* Advertise NPU as POWER9 so the guest can enable NPU2 contexts */
    _FDT((fdt_setprop_string(fdt, npuoff, "compatible", "ibm,power9-npu")));
    g_free(npuname);

    for (i = 0, linkidx = 0; i < sphb->nvgpus->num; ++i) {
        for (j = 0; j < sphb->nvgpus->slots[i].linknum; ++j) {
            char *linkname = g_strdup_printf("link@%d", linkidx);
            int off = fdt_add_subnode(fdt, npuoff, linkname);

            _FDT(off);
            /* _FDT((fdt_setprop_cell(fdt, off, "reg", linkidx))); */
            _FDT((fdt_setprop_string(fdt, off, "compatible",
                                     "ibm,npu-link")));
            _FDT((fdt_setprop_cell(fdt, off, "phandle",
                                   PHANDLE_NVLINK(sphb, i, j))));
            _FDT((fdt_setprop_cell(fdt, off, "ibm,npu-link-index", linkidx)));
            g_free(linkname);
            ++linkidx;
        }
    }

    /* Add memory nodes for GPU RAM and mark them unusable */
    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
        Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                                    "nvlink2-mr[0]",
                                                    &error_abort);
        uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL);
        uint64_t mem_reg[2] = { cpu_to_be64(nvslot->gpa), cpu_to_be64(size) };
        char *mem_name = g_strdup_printf("memory@%"PRIx64, nvslot->gpa);
        int off = fdt_add_subnode(fdt, 0, mem_name);

        _FDT(off);
        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
        _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));

        spapr_numa_write_associativity_dt(SPAPR_MACHINE(qdev_get_machine()),
                                          fdt, off, nvslot->numa_id);

        _FDT((fdt_setprop_string(fdt, off, "compatible",
                                 "ibm,coherent-device-memory")));

        mem_reg[1] = cpu_to_be64(0);
        _FDT((fdt_setprop(fdt, off, "linux,usable-memory", mem_reg,
                          sizeof(mem_reg))));
        _FDT((fdt_setprop_cell(fdt, off, "phandle",
                               PHANDLE_GPURAM(sphb, i))));
        g_free(mem_name);
    }

}
void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
                                        SpaprPhbState *sphb)
{
    int i, j;

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        /* Skip "slot" without attached GPU */
        if (!nvslot->gpdev) {
            continue;
        }
        if (dev == nvslot->gpdev) {
            uint32_t npus[nvslot->linknum];

            for (j = 0; j < nvslot->linknum; ++j) {
                PCIDevice *npdev = nvslot->links[j].npdev;

                npus[j] = cpu_to_be32(PHANDLE_PCIDEV(sphb, npdev));
            }
            _FDT(fdt_setprop(fdt, offset, "ibm,npu", npus,
                             j * sizeof(npus[0])));
            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
                                   PHANDLE_PCIDEV(sphb, dev))));
            continue;
        }

        for (j = 0; j < nvslot->linknum; ++j) {
            if (dev != nvslot->links[j].npdev) {
                continue;
            }

            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
                                   PHANDLE_PCIDEV(sphb, dev))));
            _FDT(fdt_setprop_cell(fdt, offset, "ibm,gpu",
                                  PHANDLE_PCIDEV(sphb, nvslot->gpdev)));
            _FDT((fdt_setprop_cell(fdt, offset, "ibm,nvlink",
                                   PHANDLE_NVLINK(sphb, i, j))));
            /*
             * If we ever want to emulate GPU RAM at the same location as on
             * the host - here is the encoding GPA->TGT:
             *
             * gta  = ((sphb->nv2_gpa >> 42) & 0x1) << 42;
             * gta |= ((sphb->nv2_gpa >> 45) & 0x3) << 43;
             * gta |= ((sphb->nv2_gpa >> 49) & 0x3) << 45;
             * gta |= sphb->nv2_gpa & ((1UL << 43) - 1);
             */
            _FDT(fdt_setprop_cell(fdt, offset, "memory-region",
                                  PHANDLE_GPURAM(sphb, i)));
            _FDT(fdt_setprop_u64(fdt, offset, "ibm,device-tgt-addr",
                                 nvslot->tgt));
            _FDT(fdt_setprop_cell(fdt, offset, "ibm,nvlink-speed",
                                  nvslot->links[j].link_speed));
        }
    }
}
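
/*
 * For illustration only: a rough sketch of the guest device tree fragment the
 * helpers above produce for a vPHB of index 0 with a single GPU and a single
 * NVLink bridge.  The memory node address and size are made-up example values
 * and the NUMA associativity properties are omitted.
 *
 *   npuphb0 {
 *       #address-cells = <1>;
 *       #size-cells = <0>;
 *       compatible = "ibm,power9-npu";
 *       link@0 {
 *           compatible = "ibm,npu-link";
 *           phandle = <0x00130000>;                 // PHANDLE_NVLINK(sphb, 0, 0)
 *           ibm,npu-link-index = <0>;
 *       };
 *   };
 *   memory@100000000000 {                           // example GPU RAM address
 *       device_type = "memory";
 *       compatible = "ibm,coherent-device-memory";
 *       reg = <0x1000 0x0 0x10 0x0>;                // example: 64GiB of GPU RAM
 *       linux,usable-memory = <0x1000 0x0 0x0 0x0>; // marked unusable
 *       phandle = <0x110000FF>;                     // PHANDLE_GPURAM(sphb, 0)
 *   };
 *
 * In addition, the vPHB bus node carries "ibm,mmio-atsd" (the ATSD register
 * addresses), the GPU's own PCI node gets "ibm,npu" (phandles of its NVLink
 * bridges) plus a "phandle", and each NVLink bridge node gets "phandle",
 * "ibm,gpu", "ibm,nvlink", "memory-region", "ibm,device-tgt-addr" and
 * "ibm,nvlink-speed".
 */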