pci.c - cachepc-qemu - Fork of AMDESE/qemu with changes for cachepc side-channel attack

	cachepc-qemu Fork of AMDESE/qemu with changes for cachepc side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-qemu
	Log \| Files \| Refs \| Submodules \| LICENSE \| sfeed.txt
pci.c (109447B)
      1/*
      2 * vfio based device assignment support
      3 *
      4 * Copyright Red Hat, Inc. 2012
      5 *
      6 * Authors:
      7 *  Alex Williamson <alex.williamson@redhat.com>
      8 *
      9 * This work is licensed under the terms of the GNU GPL, version 2.  See
     10 * the COPYING file in the top-level directory.
     11 *
     12 * Based on qemu-kvm device-assignment:
     13 *  Adapted for KVM by Qumranet.
     14 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
     15 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
     16 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
     17 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
     18 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
     19 */
     20
     21#include "qemu/osdep.h"
     22#include <linux/vfio.h>
     23#include <sys/ioctl.h>
     24
     25#include "hw/hw.h"
     26#include "hw/pci/msi.h"
     27#include "hw/pci/msix.h"
     28#include "hw/pci/pci_bridge.h"
     29#include "hw/qdev-properties.h"
     30#include "hw/qdev-properties-system.h"
     31#include "migration/vmstate.h"
     32#include "qemu/error-report.h"
     33#include "qemu/main-loop.h"
     34#include "qemu/module.h"
     35#include "qemu/option.h"
     36#include "qemu/range.h"
     37#include "qemu/units.h"
     38#include "sysemu/kvm.h"
     39#include "sysemu/runstate.h"
     40#include "pci.h"
     41#include "trace.h"
     42#include "qapi/error.h"
     43#include "migration/blocker.h"
     44#include "migration/qemu-file.h"
     45
     46#define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
     47
     48static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
     49static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
     50
     51/*
     52 * Disabling BAR mmapping can be slow, but toggling it around INTx can
     53 * also be a huge overhead.  We try to get the best of both worlds by
     54 * waiting until an interrupt to disable mmaps (subsequent transitions
     55 * to the same state are effectively no overhead).  If the interrupt has
     56 * been serviced and the time gap is long enough, we re-enable mmaps for
     57 * performance.  This works well for things like graphics cards, which
     58 * may not use their interrupt at all and are penalized to an unusable
     59 * level by read/write BAR traps.  Other devices, like NICs, have more
     60 * regular interrupts and see much better latency by staying in non-mmap
     61 * mode.  We therefore set the default mmap_timeout such that a ping
     62 * is just enough to keep the mmap disabled.  Users can experiment with
     63 * other options with the x-intx-mmap-timeout-ms parameter (a value of
     64 * zero disables the timer).
     65 */
     66static void vfio_intx_mmap_enable(void *opaque)
     67{
     68    VFIOPCIDevice *vdev = opaque;
     69
     70    if (vdev->intx.pending) {
     71        timer_mod(vdev->intx.mmap_timer,
     72                       qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
     73        return;
     74    }
     75
     76    vfio_mmap_set_enabled(vdev, true);
     77}
     78
     79static void vfio_intx_interrupt(void *opaque)
     80{
     81    VFIOPCIDevice *vdev = opaque;
     82
     83    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
     84        return;
     85    }
     86
     87    trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
     88
     89    vdev->intx.pending = true;
     90    pci_irq_assert(&vdev->pdev);
     91    vfio_mmap_set_enabled(vdev, false);
     92    if (vdev->intx.mmap_timeout) {
     93        timer_mod(vdev->intx.mmap_timer,
     94                       qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
     95    }
     96}
     97
     98static void vfio_intx_eoi(VFIODevice *vbasedev)
     99{
    100    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    101
    102    if (!vdev->intx.pending) {
    103        return;
    104    }
    105
    106    trace_vfio_intx_eoi(vbasedev->name);
    107
    108    vdev->intx.pending = false;
    109    pci_irq_deassert(&vdev->pdev);
    110    vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    111}
    112
    113static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
    114{
    115#ifdef CONFIG_KVM
    116    int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
    117
    118    if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
    119        vdev->intx.route.mode != PCI_INTX_ENABLED ||
    120        !kvm_resamplefds_enabled()) {
    121        return;
    122    }
    123
    124    /* Get to a known interrupt state */
    125    qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
    126    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    127    vdev->intx.pending = false;
    128    pci_irq_deassert(&vdev->pdev);
    129
    130    /* Get an eventfd for resample/unmask */
    131    if (event_notifier_init(&vdev->intx.unmask, 0)) {
    132        error_setg(errp, "event_notifier_init failed eoi");
    133        goto fail;
    134    }
    135
    136    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
    137                                           &vdev->intx.interrupt,
    138                                           &vdev->intx.unmask,
    139                                           vdev->intx.route.irq)) {
    140        error_setg_errno(errp, errno, "failed to setup resample irqfd");
    141        goto fail_irqfd;
    142    }
    143
    144    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
    145                               VFIO_IRQ_SET_ACTION_UNMASK,
    146                               event_notifier_get_fd(&vdev->intx.unmask),
    147                               errp)) {
    148        goto fail_vfio;
    149    }
    150
    151    /* Let'em rip */
    152    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    153
    154    vdev->intx.kvm_accel = true;
    155
    156    trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
    157
    158    return;
    159
    160fail_vfio:
    161    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
    162                                          vdev->intx.route.irq);
    163fail_irqfd:
    164    event_notifier_cleanup(&vdev->intx.unmask);
    165fail:
    166    qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
    167    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    168#endif
    169}
    170
    171static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
    172{
    173#ifdef CONFIG_KVM
    174    if (!vdev->intx.kvm_accel) {
    175        return;
    176    }
    177
    178    /*
    179     * Get to a known state, hardware masked, QEMU ready to accept new
    180     * interrupts, QEMU IRQ de-asserted.
    181     */
    182    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    183    vdev->intx.pending = false;
    184    pci_irq_deassert(&vdev->pdev);
    185
    186    /* Tell KVM to stop listening for an INTx irqfd */
    187    if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
    188                                              vdev->intx.route.irq)) {
    189        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    190    }
    191
    192    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    193    event_notifier_cleanup(&vdev->intx.unmask);
    194
    195    /* QEMU starts listening for interrupt events. */
    196    qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
    197                        vfio_intx_interrupt, NULL, vdev);
    198
    199    vdev->intx.kvm_accel = false;
    200
    201    /* If we've missed an event, let it re-fire through QEMU */
    202    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    203
    204    trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
    205#endif
    206}
    207
    208static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
    209{
    210    Error *err = NULL;
    211
    212    trace_vfio_intx_update(vdev->vbasedev.name,
    213                           vdev->intx.route.irq, route->irq);
    214
    215    vfio_intx_disable_kvm(vdev);
    216
    217    vdev->intx.route = *route;
    218
    219    if (route->mode != PCI_INTX_ENABLED) {
    220        return;
    221    }
    222
    223    vfio_intx_enable_kvm(vdev, &err);
    224    if (err) {
    225        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    226    }
    227
    228    /* Re-enable the interrupt in cased we missed an EOI */
    229    vfio_intx_eoi(&vdev->vbasedev);
    230}
    231
    232static void vfio_intx_routing_notifier(PCIDevice *pdev)
    233{
    234    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    235    PCIINTxRoute route;
    236
    237    if (vdev->interrupt != VFIO_INT_INTx) {
    238        return;
    239    }
    240
    241    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
    242
    243    if (pci_intx_route_changed(&vdev->intx.route, &route)) {
    244        vfio_intx_update(vdev, &route);
    245    }
    246}
    247
    248static void vfio_irqchip_change(Notifier *notify, void *data)
    249{
    250    VFIOPCIDevice *vdev = container_of(notify, VFIOPCIDevice,
    251                                       irqchip_change_notifier);
    252
    253    vfio_intx_update(vdev, &vdev->intx.route);
    254}
    255
    256static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
    257{
    258    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    259    Error *err = NULL;
    260    int32_t fd;
    261    int ret;
    262
    263
    264    if (!pin) {
    265        return 0;
    266    }
    267
    268    vfio_disable_interrupts(vdev);
    269
    270    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
    271    pci_config_set_interrupt_pin(vdev->pdev.config, pin);
    272
    273#ifdef CONFIG_KVM
    274    /*
    275     * Only conditional to avoid generating error messages on platforms
    276     * where we won't actually use the result anyway.
    277     */
    278    if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
    279        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
    280                                                        vdev->intx.pin);
    281    }
    282#endif
    283
    284    ret = event_notifier_init(&vdev->intx.interrupt, 0);
    285    if (ret) {
    286        error_setg_errno(errp, -ret, "event_notifier_init failed");
    287        return ret;
    288    }
    289    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    290    qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
    291
    292    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
    293                               VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
    294        qemu_set_fd_handler(fd, NULL, NULL, vdev);
    295        event_notifier_cleanup(&vdev->intx.interrupt);
    296        return -errno;
    297    }
    298
    299    vfio_intx_enable_kvm(vdev, &err);
    300    if (err) {
    301        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    302    }
    303
    304    vdev->interrupt = VFIO_INT_INTx;
    305
    306    trace_vfio_intx_enable(vdev->vbasedev.name);
    307    return 0;
    308}
    309
    310static void vfio_intx_disable(VFIOPCIDevice *vdev)
    311{
    312    int fd;
    313
    314    timer_del(vdev->intx.mmap_timer);
    315    vfio_intx_disable_kvm(vdev);
    316    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    317    vdev->intx.pending = false;
    318    pci_irq_deassert(&vdev->pdev);
    319    vfio_mmap_set_enabled(vdev, true);
    320
    321    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    322    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    323    event_notifier_cleanup(&vdev->intx.interrupt);
    324
    325    vdev->interrupt = VFIO_INT_NONE;
    326
    327    trace_vfio_intx_disable(vdev->vbasedev.name);
    328}
    329
    330/*
    331 * MSI/X
    332 */
    333static void vfio_msi_interrupt(void *opaque)
    334{
    335    VFIOMSIVector *vector = opaque;
    336    VFIOPCIDevice *vdev = vector->vdev;
    337    MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
    338    void (*notify)(PCIDevice *dev, unsigned vector);
    339    MSIMessage msg;
    340    int nr = vector - vdev->msi_vectors;
    341
    342    if (!event_notifier_test_and_clear(&vector->interrupt)) {
    343        return;
    344    }
    345
    346    if (vdev->interrupt == VFIO_INT_MSIX) {
    347        get_msg = msix_get_message;
    348        notify = msix_notify;
    349
    350        /* A masked vector firing needs to use the PBA, enable it */
    351        if (msix_is_masked(&vdev->pdev, nr)) {
    352            set_bit(nr, vdev->msix->pending);
    353            memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
    354            trace_vfio_msix_pba_enable(vdev->vbasedev.name);
    355        }
    356    } else if (vdev->interrupt == VFIO_INT_MSI) {
    357        get_msg = msi_get_message;
    358        notify = msi_notify;
    359    } else {
    360        abort();
    361    }
    362
    363    msg = get_msg(&vdev->pdev, nr);
    364    trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
    365    notify(&vdev->pdev, nr);
    366}
    367
    368static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
    369{
    370    struct vfio_irq_set *irq_set;
    371    int ret = 0, i, argsz;
    372    int32_t *fds;
    373
    374    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
    375
    376    irq_set = g_malloc0(argsz);
    377    irq_set->argsz = argsz;
    378    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    379    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    380    irq_set->start = 0;
    381    irq_set->count = vdev->nr_vectors;
    382    fds = (int32_t *)&irq_set->data;
    383
    384    for (i = 0; i < vdev->nr_vectors; i++) {
    385        int fd = -1;
    386
    387        /*
    388         * MSI vs MSI-X - The guest has direct access to MSI mask and pending
    389         * bits, therefore we always use the KVM signaling path when setup.
    390         * MSI-X mask and pending bits are emulated, so we want to use the
    391         * KVM signaling path only when configured and unmasked.
    392         */
    393        if (vdev->msi_vectors[i].use) {
    394            if (vdev->msi_vectors[i].virq < 0 ||
    395                (msix && msix_is_masked(&vdev->pdev, i))) {
    396                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
    397            } else {
    398                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
    399            }
    400        }
    401
    402        fds[i] = fd;
    403    }
    404
    405    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    406
    407    g_free(irq_set);
    408
    409    return ret;
    410}
    411
    412static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
    413                                  int vector_n, bool msix)
    414{
    415    int virq;
    416
    417    if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
    418        return;
    419    }
    420
    421    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
    422        return;
    423    }
    424
    425    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev);
    426    if (virq < 0) {
    427        event_notifier_cleanup(&vector->kvm_interrupt);
    428        return;
    429    }
    430
    431    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
    432                                       NULL, virq) < 0) {
    433        kvm_irqchip_release_virq(kvm_state, virq);
    434        event_notifier_cleanup(&vector->kvm_interrupt);
    435        return;
    436    }
    437
    438    vector->virq = virq;
    439}
    440
    441static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
    442{
    443    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
    444                                          vector->virq);
    445    kvm_irqchip_release_virq(kvm_state, vector->virq);
    446    vector->virq = -1;
    447    event_notifier_cleanup(&vector->kvm_interrupt);
    448}
    449
    450static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
    451                                     PCIDevice *pdev)
    452{
    453    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
    454    kvm_irqchip_commit_routes(kvm_state);
    455}
    456
    457static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
    458                                   MSIMessage *msg, IOHandler *handler)
    459{
    460    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    461    VFIOMSIVector *vector;
    462    int ret;
    463
    464    trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
    465
    466    vector = &vdev->msi_vectors[nr];
    467
    468    if (!vector->use) {
    469        vector->vdev = vdev;
    470        vector->virq = -1;
    471        if (event_notifier_init(&vector->interrupt, 0)) {
    472            error_report("vfio: Error: event_notifier_init failed");
    473        }
    474        vector->use = true;
    475        msix_vector_use(pdev, nr);
    476    }
    477
    478    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
    479                        handler, NULL, vector);
    480
    481    /*
    482     * Attempt to enable route through KVM irqchip,
    483     * default to userspace handling if unavailable.
    484     */
    485    if (vector->virq >= 0) {
    486        if (!msg) {
    487            vfio_remove_kvm_msi_virq(vector);
    488        } else {
    489            vfio_update_kvm_msi_virq(vector, *msg, pdev);
    490        }
    491    } else {
    492        if (msg) {
    493            vfio_add_kvm_msi_virq(vdev, vector, nr, true);
    494        }
    495    }
    496
    497    /*
    498     * We don't want to have the host allocate all possible MSI vectors
    499     * for a device if they're not in use, so we shutdown and incrementally
    500     * increase them as needed.
    501     */
    502    if (vdev->nr_vectors < nr + 1) {
    503        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
    504        vdev->nr_vectors = nr + 1;
    505        ret = vfio_enable_vectors(vdev, true);
    506        if (ret) {
    507            error_report("vfio: failed to enable vectors, %d", ret);
    508        }
    509    } else {
    510        Error *err = NULL;
    511        int32_t fd;
    512
    513        if (vector->virq >= 0) {
    514            fd = event_notifier_get_fd(&vector->kvm_interrupt);
    515        } else {
    516            fd = event_notifier_get_fd(&vector->interrupt);
    517        }
    518
    519        if (vfio_set_irq_signaling(&vdev->vbasedev,
    520                                     VFIO_PCI_MSIX_IRQ_INDEX, nr,
    521                                     VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
    522            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    523        }
    524    }
    525
    526    /* Disable PBA emulation when nothing more is pending. */
    527    clear_bit(nr, vdev->msix->pending);
    528    if (find_first_bit(vdev->msix->pending,
    529                       vdev->nr_vectors) == vdev->nr_vectors) {
    530        memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
    531        trace_vfio_msix_pba_disable(vdev->vbasedev.name);
    532    }
    533
    534    return 0;
    535}
    536
    537static int vfio_msix_vector_use(PCIDevice *pdev,
    538                                unsigned int nr, MSIMessage msg)
    539{
    540    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
    541}
    542
    543static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
    544{
    545    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    546    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
    547
    548    trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
    549
    550    /*
    551     * There are still old guests that mask and unmask vectors on every
    552     * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
    553     * the KVM setup in place, simply switch VFIO to use the non-bypass
    554     * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
    555     * core will mask the interrupt and set pending bits, allowing it to
    556     * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
    557     */
    558    if (vector->virq >= 0) {
    559        int32_t fd = event_notifier_get_fd(&vector->interrupt);
    560        Error *err = NULL;
    561
    562        if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
    563                                   VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
    564            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    565        }
    566    }
    567}
    568
    569static void vfio_msix_enable(VFIOPCIDevice *vdev)
    570{
    571    PCIDevice *pdev = &vdev->pdev;
    572    unsigned int nr, max_vec = 0;
    573
    574    vfio_disable_interrupts(vdev);
    575
    576    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
    577
    578    vdev->interrupt = VFIO_INT_MSIX;
    579
    580    /*
    581     * Some communication channels between VF & PF or PF & fw rely on the
    582     * physical state of the device and expect that enabling MSI-X from the
    583     * guest enables the same on the host.  When our guest is Linux, the
    584     * guest driver call to pci_enable_msix() sets the enabling bit in the
    585     * MSI-X capability, but leaves the vector table masked.  We therefore
    586     * can't rely on a vector_use callback (from request_irq() in the guest)
    587     * to switch the physical device into MSI-X mode because that may come a
    588     * long time after pci_enable_msix().  This code enables vector 0 with
    589     * triggering to userspace, then immediately release the vector, leaving
    590     * the physical device with no vectors enabled, but MSI-X enabled, just
    591     * like the guest view.
    592     * If there are already unmasked vectors (in migration resume phase and
    593     * some guest startups) which will be enabled soon, we can allocate all
    594     * of them here to avoid inefficiently disabling and enabling vectors
    595     * repeatedly later.
    596     */
    597    if (!pdev->msix_function_masked) {
    598        for (nr = 0; nr < msix_nr_vectors_allocated(pdev); nr++) {
    599            if (!msix_is_masked(pdev, nr)) {
    600                max_vec = nr;
    601            }
    602        }
    603    }
    604    vfio_msix_vector_do_use(pdev, max_vec, NULL, NULL);
    605    vfio_msix_vector_release(pdev, max_vec);
    606
    607    if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
    608                                  vfio_msix_vector_release, NULL)) {
    609        error_report("vfio: msix_set_vector_notifiers failed");
    610    }
    611
    612    trace_vfio_msix_enable(vdev->vbasedev.name);
    613}
    614
    615static void vfio_msi_enable(VFIOPCIDevice *vdev)
    616{
    617    int ret, i;
    618
    619    vfio_disable_interrupts(vdev);
    620
    621    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
    622retry:
    623    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
    624
    625    for (i = 0; i < vdev->nr_vectors; i++) {
    626        VFIOMSIVector *vector = &vdev->msi_vectors[i];
    627
    628        vector->vdev = vdev;
    629        vector->virq = -1;
    630        vector->use = true;
    631
    632        if (event_notifier_init(&vector->interrupt, 0)) {
    633            error_report("vfio: Error: event_notifier_init failed");
    634        }
    635
    636        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
    637                            vfio_msi_interrupt, NULL, vector);
    638
    639        /*
    640         * Attempt to enable route through KVM irqchip,
    641         * default to userspace handling if unavailable.
    642         */
    643        vfio_add_kvm_msi_virq(vdev, vector, i, false);
    644    }
    645
    646    /* Set interrupt type prior to possible interrupts */
    647    vdev->interrupt = VFIO_INT_MSI;
    648
    649    ret = vfio_enable_vectors(vdev, false);
    650    if (ret) {
    651        if (ret < 0) {
    652            error_report("vfio: Error: Failed to setup MSI fds: %m");
    653        } else if (ret != vdev->nr_vectors) {
    654            error_report("vfio: Error: Failed to enable %d "
    655                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
    656        }
    657
    658        for (i = 0; i < vdev->nr_vectors; i++) {
    659            VFIOMSIVector *vector = &vdev->msi_vectors[i];
    660            if (vector->virq >= 0) {
    661                vfio_remove_kvm_msi_virq(vector);
    662            }
    663            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
    664                                NULL, NULL, NULL);
    665            event_notifier_cleanup(&vector->interrupt);
    666        }
    667
    668        g_free(vdev->msi_vectors);
    669        vdev->msi_vectors = NULL;
    670
    671        if (ret > 0 && ret != vdev->nr_vectors) {
    672            vdev->nr_vectors = ret;
    673            goto retry;
    674        }
    675        vdev->nr_vectors = 0;
    676
    677        /*
    678         * Failing to setup MSI doesn't really fall within any specification.
    679         * Let's try leaving interrupts disabled and hope the guest figures
    680         * out to fall back to INTx for this device.
    681         */
    682        error_report("vfio: Error: Failed to enable MSI");
    683        vdev->interrupt = VFIO_INT_NONE;
    684
    685        return;
    686    }
    687
    688    trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
    689}
    690
    691static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
    692{
    693    Error *err = NULL;
    694    int i;
    695
    696    for (i = 0; i < vdev->nr_vectors; i++) {
    697        VFIOMSIVector *vector = &vdev->msi_vectors[i];
    698        if (vdev->msi_vectors[i].use) {
    699            if (vector->virq >= 0) {
    700                vfio_remove_kvm_msi_virq(vector);
    701            }
    702            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
    703                                NULL, NULL, NULL);
    704            event_notifier_cleanup(&vector->interrupt);
    705        }
    706    }
    707
    708    g_free(vdev->msi_vectors);
    709    vdev->msi_vectors = NULL;
    710    vdev->nr_vectors = 0;
    711    vdev->interrupt = VFIO_INT_NONE;
    712
    713    vfio_intx_enable(vdev, &err);
    714    if (err) {
    715        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    716    }
    717}
    718
    719static void vfio_msix_disable(VFIOPCIDevice *vdev)
    720{
    721    int i;
    722
    723    msix_unset_vector_notifiers(&vdev->pdev);
    724
    725    /*
    726     * MSI-X will only release vectors if MSI-X is still enabled on the
    727     * device, check through the rest and release it ourselves if necessary.
    728     */
    729    for (i = 0; i < vdev->nr_vectors; i++) {
    730        if (vdev->msi_vectors[i].use) {
    731            vfio_msix_vector_release(&vdev->pdev, i);
    732            msix_vector_unuse(&vdev->pdev, i);
    733        }
    734    }
    735
    736    if (vdev->nr_vectors) {
    737        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
    738    }
    739
    740    vfio_msi_disable_common(vdev);
    741
    742    memset(vdev->msix->pending, 0,
    743           BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
    744
    745    trace_vfio_msix_disable(vdev->vbasedev.name);
    746}
    747
    748static void vfio_msi_disable(VFIOPCIDevice *vdev)
    749{
    750    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
    751    vfio_msi_disable_common(vdev);
    752
    753    trace_vfio_msi_disable(vdev->vbasedev.name);
    754}
    755
    756static void vfio_update_msi(VFIOPCIDevice *vdev)
    757{
    758    int i;
    759
    760    for (i = 0; i < vdev->nr_vectors; i++) {
    761        VFIOMSIVector *vector = &vdev->msi_vectors[i];
    762        MSIMessage msg;
    763
    764        if (!vector->use || vector->virq < 0) {
    765            continue;
    766        }
    767
    768        msg = msi_get_message(&vdev->pdev, i);
    769        vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
    770    }
    771}
    772
    773static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
    774{
    775    struct vfio_region_info *reg_info;
    776    uint64_t size;
    777    off_t off = 0;
    778    ssize_t bytes;
    779
    780    if (vfio_get_region_info(&vdev->vbasedev,
    781                             VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
    782        error_report("vfio: Error getting ROM info: %m");
    783        return;
    784    }
    785
    786    trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size,
    787                            (unsigned long)reg_info->offset,
    788                            (unsigned long)reg_info->flags);
    789
    790    vdev->rom_size = size = reg_info->size;
    791    vdev->rom_offset = reg_info->offset;
    792
    793    g_free(reg_info);
    794
    795    if (!vdev->rom_size) {
    796        vdev->rom_read_failed = true;
    797        error_report("vfio-pci: Cannot read device rom at "
    798                    "%s", vdev->vbasedev.name);
    799        error_printf("Device option ROM contents are probably invalid "
    800                    "(check dmesg).\nSkip option ROM probe with rombar=0, "
    801                    "or load from file with romfile=\n");
    802        return;
    803    }
    804
    805    vdev->rom = g_malloc(size);
    806    memset(vdev->rom, 0xff, size);
    807
    808    while (size) {
    809        bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
    810                      size, vdev->rom_offset + off);
    811        if (bytes == 0) {
    812            break;
    813        } else if (bytes > 0) {
    814            off += bytes;
    815            size -= bytes;
    816        } else {
    817            if (errno == EINTR || errno == EAGAIN) {
    818                continue;
    819            }
    820            error_report("vfio: Error reading device ROM: %m");
    821            break;
    822        }
    823    }
    824
    825    /*
    826     * Test the ROM signature against our device, if the vendor is correct
    827     * but the device ID doesn't match, store the correct device ID and
    828     * recompute the checksum.  Intel IGD devices need this and are known
    829     * to have bogus checksums so we can't simply adjust the checksum.
    830     */
    831    if (pci_get_word(vdev->rom) == 0xaa55 &&
    832        pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
    833        !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
    834        uint16_t vid, did;
    835
    836        vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
    837        did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);
    838
    839        if (vid == vdev->vendor_id && did != vdev->device_id) {
    840            int i;
    841            uint8_t csum, *data = vdev->rom;
    842
    843            pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
    844                         vdev->device_id);
    845            data[6] = 0;
    846
    847            for (csum = 0, i = 0; i < vdev->rom_size; i++) {
    848                csum += data[i];
    849            }
    850
    851            data[6] = -csum;
    852        }
    853    }
    854}
    855
    856static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
    857{
    858    VFIOPCIDevice *vdev = opaque;
    859    union {
    860        uint8_t byte;
    861        uint16_t word;
    862        uint32_t dword;
    863        uint64_t qword;
    864    } val;
    865    uint64_t data = 0;
    866
    867    /* Load the ROM lazily when the guest tries to read it */
    868    if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
    869        vfio_pci_load_rom(vdev);
    870    }
    871
    872    memcpy(&val, vdev->rom + addr,
    873           (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
    874
    875    switch (size) {
    876    case 1:
    877        data = val.byte;
    878        break;
    879    case 2:
    880        data = le16_to_cpu(val.word);
    881        break;
    882    case 4:
    883        data = le32_to_cpu(val.dword);
    884        break;
    885    default:
    886        hw_error("vfio: unsupported read size, %d bytes\n", size);
    887        break;
    888    }
    889
    890    trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
    891
    892    return data;
    893}
    894
    895static void vfio_rom_write(void *opaque, hwaddr addr,
    896                           uint64_t data, unsigned size)
    897{
    898}
    899
    900static const MemoryRegionOps vfio_rom_ops = {
    901    .read = vfio_rom_read,
    902    .write = vfio_rom_write,
    903    .endianness = DEVICE_LITTLE_ENDIAN,
    904};
    905
    906static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
    907{
    908    uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
    909    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
    910    DeviceState *dev = DEVICE(vdev);
    911    char *name;
    912    int fd = vdev->vbasedev.fd;
    913
    914    if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
    915        /* Since pci handles romfile, just print a message and return */
    916        if (vfio_opt_rom_in_denylist(vdev) && vdev->pdev.romfile) {
    917            warn_report("Device at %s is known to cause system instability"
    918                        " issues during option rom execution",
    919                        vdev->vbasedev.name);
    920            error_printf("Proceeding anyway since user specified romfile\n");
    921        }
    922        return;
    923    }
    924
    925    /*
    926     * Use the same size ROM BAR as the physical device.  The contents
    927     * will get filled in later when the guest tries to read it.
    928     */
    929    if (pread(fd, &orig, 4, offset) != 4 ||
    930        pwrite(fd, &size, 4, offset) != 4 ||
    931        pread(fd, &size, 4, offset) != 4 ||
    932        pwrite(fd, &orig, 4, offset) != 4) {
    933        error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
    934        return;
    935    }
    936
    937    size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
    938
    939    if (!size) {
    940        return;
    941    }
    942
    943    if (vfio_opt_rom_in_denylist(vdev)) {
    944        if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
    945            warn_report("Device at %s is known to cause system instability"
    946                        " issues during option rom execution",
    947                        vdev->vbasedev.name);
    948            error_printf("Proceeding anyway since user specified"
    949                         " non zero value for rombar\n");
    950        } else {
    951            warn_report("Rom loading for device at %s has been disabled"
    952                        " due to system instability issues",
    953                        vdev->vbasedev.name);
    954            error_printf("Specify rombar=1 or romfile to force\n");
    955            return;
    956        }
    957    }
    958
    959    trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
    960
    961    name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);
    962
    963    memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
    964                          &vfio_rom_ops, vdev, name, size);
    965    g_free(name);
    966
    967    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
    968                     PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
    969
    970    vdev->rom_read_failed = false;
    971}
    972
    973void vfio_vga_write(void *opaque, hwaddr addr,
    974                           uint64_t data, unsigned size)
    975{
    976    VFIOVGARegion *region = opaque;
    977    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    978    union {
    979        uint8_t byte;
    980        uint16_t word;
    981        uint32_t dword;
    982        uint64_t qword;
    983    } buf;
    984    off_t offset = vga->fd_offset + region->offset + addr;
    985
    986    switch (size) {
    987    case 1:
    988        buf.byte = data;
    989        break;
    990    case 2:
    991        buf.word = cpu_to_le16(data);
    992        break;
    993    case 4:
    994        buf.dword = cpu_to_le32(data);
    995        break;
    996    default:
    997        hw_error("vfio: unsupported write size, %d bytes", size);
    998        break;
    999    }
   1000
   1001    if (pwrite(vga->fd, &buf, size, offset) != size) {
   1002        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
   1003                     __func__, region->offset + addr, data, size);
   1004    }
   1005
   1006    trace_vfio_vga_write(region->offset + addr, data, size);
   1007}
   1008
   1009uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
   1010{
   1011    VFIOVGARegion *region = opaque;
   1012    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
   1013    union {
   1014        uint8_t byte;
   1015        uint16_t word;
   1016        uint32_t dword;
   1017        uint64_t qword;
   1018    } buf;
   1019    uint64_t data = 0;
   1020    off_t offset = vga->fd_offset + region->offset + addr;
   1021
   1022    if (pread(vga->fd, &buf, size, offset) != size) {
   1023        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
   1024                     __func__, region->offset + addr, size);
   1025        return (uint64_t)-1;
   1026    }
   1027
   1028    switch (size) {
   1029    case 1:
   1030        data = buf.byte;
   1031        break;
   1032    case 2:
   1033        data = le16_to_cpu(buf.word);
   1034        break;
   1035    case 4:
   1036        data = le32_to_cpu(buf.dword);
   1037        break;
   1038    default:
   1039        hw_error("vfio: unsupported read size, %d bytes", size);
   1040        break;
   1041    }
   1042
   1043    trace_vfio_vga_read(region->offset + addr, size, data);
   1044
   1045    return data;
   1046}
   1047
   1048static const MemoryRegionOps vfio_vga_ops = {
   1049    .read = vfio_vga_read,
   1050    .write = vfio_vga_write,
   1051    .endianness = DEVICE_LITTLE_ENDIAN,
   1052};
   1053
   1054/*
   1055 * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page
   1056 * size if the BAR is in an exclusive page in host so that we could map
   1057 * this BAR to guest. But this sub-page BAR may not occupy an exclusive
   1058 * page in guest. So we should set the priority of the expanded memory
   1059 * region to zero in case of overlap with BARs which share the same page
   1060 * with the sub-page BAR in guest. Besides, we should also recover the
   1061 * size of this sub-page BAR when its base address is changed in guest
   1062 * and not page aligned any more.
   1063 */
   1064static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
   1065{
   1066    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
   1067    VFIORegion *region = &vdev->bars[bar].region;
   1068    MemoryRegion *mmap_mr, *region_mr, *base_mr;
   1069    PCIIORegion *r;
   1070    pcibus_t bar_addr;
   1071    uint64_t size = region->size;
   1072
   1073    /* Make sure that the whole region is allowed to be mmapped */
   1074    if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
   1075        region->mmaps[0].size != region->size) {
   1076        return;
   1077    }
   1078
   1079    r = &pdev->io_regions[bar];
   1080    bar_addr = r->addr;
   1081    base_mr = vdev->bars[bar].mr;
   1082    region_mr = region->mem;
   1083    mmap_mr = &region->mmaps[0].mem;
   1084
   1085    /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
   1086    if (bar_addr != PCI_BAR_UNMAPPED &&
   1087        !(bar_addr & ~qemu_real_host_page_mask)) {
   1088        size = qemu_real_host_page_size;
   1089    }
   1090
   1091    memory_region_transaction_begin();
   1092
   1093    if (vdev->bars[bar].size < size) {
   1094        memory_region_set_size(base_mr, size);
   1095    }
   1096    memory_region_set_size(region_mr, size);
   1097    memory_region_set_size(mmap_mr, size);
   1098    if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
   1099        memory_region_del_subregion(r->address_space, base_mr);
   1100        memory_region_add_subregion_overlap(r->address_space,
   1101                                            bar_addr, base_mr, 0);
   1102    }
   1103
   1104    memory_region_transaction_commit();
   1105}
   1106
   1107/*
   1108 * PCI config space
   1109 */
   1110uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
   1111{
   1112    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
   1113    uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
   1114
   1115    memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
   1116    emu_bits = le32_to_cpu(emu_bits);
   1117
   1118    if (emu_bits) {
   1119        emu_val = pci_default_read_config(pdev, addr, len);
   1120    }
   1121
   1122    if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
   1123        ssize_t ret;
   1124
   1125        ret = pread(vdev->vbasedev.fd, &phys_val, len,
   1126                    vdev->config_offset + addr);
   1127        if (ret != len) {
   1128            error_report("%s(%s, 0x%x, 0x%x) failed: %m",
   1129                         __func__, vdev->vbasedev.name, addr, len);
   1130            return -errno;
   1131        }
   1132        phys_val = le32_to_cpu(phys_val);
   1133    }
   1134
   1135    val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
   1136
   1137    trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
   1138
   1139    return val;
   1140}
   1141
   1142void vfio_pci_write_config(PCIDevice *pdev,
   1143                           uint32_t addr, uint32_t val, int len)
   1144{
   1145    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
   1146    uint32_t val_le = cpu_to_le32(val);
   1147
   1148    trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
   1149
   1150    /* Write everything to VFIO, let it filter out what we can't write */
   1151    if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
   1152                != len) {
   1153        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
   1154                     __func__, vdev->vbasedev.name, addr, val, len);
   1155    }
   1156
   1157    /* MSI/MSI-X Enabling/Disabling */
   1158    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
   1159        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
   1160        int is_enabled, was_enabled = msi_enabled(pdev);
   1161
   1162        pci_default_write_config(pdev, addr, val, len);
   1163
   1164        is_enabled = msi_enabled(pdev);
   1165
   1166        if (!was_enabled) {
   1167            if (is_enabled) {
   1168                vfio_msi_enable(vdev);
   1169            }
   1170        } else {
   1171            if (!is_enabled) {
   1172                vfio_msi_disable(vdev);
   1173            } else {
   1174                vfio_update_msi(vdev);
   1175            }
   1176        }
   1177    } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
   1178        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
   1179        int is_enabled, was_enabled = msix_enabled(pdev);
   1180
   1181        pci_default_write_config(pdev, addr, val, len);
   1182
   1183        is_enabled = msix_enabled(pdev);
   1184
   1185        if (!was_enabled && is_enabled) {
   1186            vfio_msix_enable(vdev);
   1187        } else if (was_enabled && !is_enabled) {
   1188            vfio_msix_disable(vdev);
   1189        }
   1190    } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
   1191        range_covers_byte(addr, len, PCI_COMMAND)) {
   1192        pcibus_t old_addr[PCI_NUM_REGIONS - 1];
   1193        int bar;
   1194
   1195        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
   1196            old_addr[bar] = pdev->io_regions[bar].addr;
   1197        }
   1198
   1199        pci_default_write_config(pdev, addr, val, len);
   1200
   1201        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
   1202            if (old_addr[bar] != pdev->io_regions[bar].addr &&
   1203                vdev->bars[bar].region.size > 0 &&
   1204                vdev->bars[bar].region.size < qemu_real_host_page_size) {
   1205                vfio_sub_page_bar_update_mapping(pdev, bar);
   1206            }
   1207        }
   1208    } else {
   1209        /* Write everything to QEMU to keep emulated bits correct */
   1210        pci_default_write_config(pdev, addr, val, len);
   1211    }
   1212}
   1213
   1214/*
   1215 * Interrupt setup
   1216 */
   1217static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
   1218{
   1219    /*
   1220     * More complicated than it looks.  Disabling MSI/X transitions the
   1221     * device to INTx mode (if supported).  Therefore we need to first
   1222     * disable MSI/X and then cleanup by disabling INTx.
   1223     */
   1224    if (vdev->interrupt == VFIO_INT_MSIX) {
   1225        vfio_msix_disable(vdev);
   1226    } else if (vdev->interrupt == VFIO_INT_MSI) {
   1227        vfio_msi_disable(vdev);
   1228    }
   1229
   1230    if (vdev->interrupt == VFIO_INT_INTx) {
   1231        vfio_intx_disable(vdev);
   1232    }
   1233}
   1234
   1235static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
   1236{
   1237    uint16_t ctrl;
   1238    bool msi_64bit, msi_maskbit;
   1239    int ret, entries;
   1240    Error *err = NULL;
   1241
   1242    if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
   1243              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
   1244        error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
   1245        return -errno;
   1246    }
   1247    ctrl = le16_to_cpu(ctrl);
   1248
   1249    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
   1250    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
   1251    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
   1252
   1253    trace_vfio_msi_setup(vdev->vbasedev.name, pos);
   1254
   1255    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
   1256    if (ret < 0) {
   1257        if (ret == -ENOTSUP) {
   1258            return 0;
   1259        }
   1260        error_propagate_prepend(errp, err, "msi_init failed: ");
   1261        return ret;
   1262    }
   1263    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
   1264
   1265    return 0;
   1266}
   1267
   1268static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
   1269{
   1270    off_t start, end;
   1271    VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;
   1272
   1273    /*
   1274     * If the host driver allows mapping of a MSIX data, we are going to
   1275     * do map the entire BAR and emulate MSIX table on top of that.
   1276     */
   1277    if (vfio_has_region_cap(&vdev->vbasedev, region->nr,
   1278                            VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
   1279        return;
   1280    }
   1281
   1282    /*
   1283     * We expect to find a single mmap covering the whole BAR, anything else
   1284     * means it's either unsupported or already setup.
   1285     */
   1286    if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
   1287        region->size != region->mmaps[0].size) {
   1288        return;
   1289    }
   1290
   1291    /* MSI-X table start and end aligned to host page size */
   1292    start = vdev->msix->table_offset & qemu_real_host_page_mask;
   1293    end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
   1294                               (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
   1295
   1296    /*
   1297     * Does the MSI-X table cover the beginning of the BAR?  The whole BAR?
   1298     * NB - Host page size is necessarily a power of two and so is the PCI
   1299     * BAR (not counting EA yet), therefore if we have host page aligned
   1300     * @start and @end, then any remainder of the BAR before or after those
   1301     * must be at least host page sized and therefore mmap'able.
   1302     */
   1303    if (!start) {
   1304        if (end >= region->size) {
   1305            region->nr_mmaps = 0;
   1306            g_free(region->mmaps);
   1307            region->mmaps = NULL;
   1308            trace_vfio_msix_fixup(vdev->vbasedev.name,
   1309                                  vdev->msix->table_bar, 0, 0);
   1310        } else {
   1311            region->mmaps[0].offset = end;
   1312            region->mmaps[0].size = region->size - end;
   1313            trace_vfio_msix_fixup(vdev->vbasedev.name,
   1314                              vdev->msix->table_bar, region->mmaps[0].offset,
   1315                              region->mmaps[0].offset + region->mmaps[0].size);
   1316        }
   1317
   1318    /* Maybe it's aligned at the end of the BAR */
   1319    } else if (end >= region->size) {
   1320        region->mmaps[0].size = start;
   1321        trace_vfio_msix_fixup(vdev->vbasedev.name,
   1322                              vdev->msix->table_bar, region->mmaps[0].offset,
   1323                              region->mmaps[0].offset + region->mmaps[0].size);
   1324
   1325    /* Otherwise it must split the BAR */
   1326    } else {
   1327        region->nr_mmaps = 2;
   1328        region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);
   1329
   1330        memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));
   1331
   1332        region->mmaps[0].size = start;
   1333        trace_vfio_msix_fixup(vdev->vbasedev.name,
   1334                              vdev->msix->table_bar, region->mmaps[0].offset,
   1335                              region->mmaps[0].offset + region->mmaps[0].size);
   1336
   1337        region->mmaps[1].offset = end;
   1338        region->mmaps[1].size = region->size - end;
   1339        trace_vfio_msix_fixup(vdev->vbasedev.name,
   1340                              vdev->msix->table_bar, region->mmaps[1].offset,
   1341                              region->mmaps[1].offset + region->mmaps[1].size);
   1342    }
   1343}
   1344
   1345static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
   1346{
   1347    int target_bar = -1;
   1348    size_t msix_sz;
   1349
   1350    if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
   1351        return;
   1352    }
   1353
   1354    /* The actual minimum size of MSI-X structures */
   1355    msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
   1356              (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
   1357    /* Round up to host pages, we don't want to share a page */
   1358    msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
   1359    /* PCI BARs must be a power of 2 */
   1360    msix_sz = pow2ceil(msix_sz);
   1361
   1362    if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
   1363        /*
   1364         * TODO: Lookup table for known devices.
   1365         *
   1366         * Logically we might use an algorithm here to select the BAR adding
   1367         * the least additional MMIO space, but we cannot programmatically
   1368         * predict the driver dependency on BAR ordering or sizing, therefore
   1369         * 'auto' becomes a lookup for combinations reported to work.
   1370         */
   1371        if (target_bar < 0) {
   1372            error_setg(errp, "No automatic MSI-X relocation available for "
   1373                       "device %04x:%04x", vdev->vendor_id, vdev->device_id);
   1374            return;
   1375        }
   1376    } else {
   1377        target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
   1378    }
   1379
   1380    /* I/O port BARs cannot host MSI-X structures */
   1381    if (vdev->bars[target_bar].ioport) {
   1382        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
   1383                   "I/O port BAR", target_bar);
   1384        return;
   1385    }
   1386
   1387    /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
   1388    if (!vdev->bars[target_bar].size &&
   1389         target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
   1390        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
   1391                   "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
   1392        return;
   1393    }
   1394
   1395    /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
   1396    if (vdev->bars[target_bar].size > 1 * GiB &&
   1397        !vdev->bars[target_bar].mem64) {
   1398        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
   1399                   "no space to extend 32-bit BAR", target_bar);
   1400        return;
   1401    }
   1402
   1403    /*
   1404     * If adding a new BAR, test if we can make it 64bit.  We make it
   1405     * prefetchable since QEMU MSI-X emulation has no read side effects
   1406     * and doing so makes mapping more flexible.
   1407     */
   1408    if (!vdev->bars[target_bar].size) {
   1409        if (target_bar < (PCI_ROM_SLOT - 1) &&
   1410            !vdev->bars[target_bar + 1].size) {
   1411            vdev->bars[target_bar].mem64 = true;
   1412            vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
   1413        }
   1414        vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
   1415        vdev->bars[target_bar].size = msix_sz;
   1416        vdev->msix->table_offset = 0;
   1417    } else {
   1418        vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
   1419                                          msix_sz * 2);
   1420        /*
   1421         * Due to above size calc, MSI-X always starts halfway into the BAR,
   1422         * which will always be a separate host page.
   1423         */
   1424        vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
   1425    }
   1426
   1427    vdev->msix->table_bar = target_bar;
   1428    vdev->msix->pba_bar = target_bar;
   1429    /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
   1430    vdev->msix->pba_offset = vdev->msix->table_offset +
   1431                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
   1432
   1433    trace_vfio_msix_relo(vdev->vbasedev.name,
   1434                         vdev->msix->table_bar, vdev->msix->table_offset);
   1435}
   1436
   1437/*
   1438 * We don't have any control over how pci_add_capability() inserts
   1439 * capabilities into the chain.  In order to setup MSI-X we need a
   1440 * MemoryRegion for the BAR.  In order to setup the BAR and not
   1441 * attempt to mmap the MSI-X table area, which VFIO won't allow, we
   1442 * need to first look for where the MSI-X table lives.  So we
   1443 * unfortunately split MSI-X setup across two functions.
   1444 */
   1445static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
   1446{
   1447    uint8_t pos;
   1448    uint16_t ctrl;
   1449    uint32_t table, pba;
   1450    int fd = vdev->vbasedev.fd;
   1451    VFIOMSIXInfo *msix;
   1452
   1453    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
   1454    if (!pos) {
   1455        return;
   1456    }
   1457
   1458    if (pread(fd, &ctrl, sizeof(ctrl),
   1459              vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
   1460        error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
   1461        return;
   1462    }
   1463
   1464    if (pread(fd, &table, sizeof(table),
   1465              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
   1466        error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
   1467        return;
   1468    }
   1469
   1470    if (pread(fd, &pba, sizeof(pba),
   1471              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
   1472        error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
   1473        return;
   1474    }
   1475
   1476    ctrl = le16_to_cpu(ctrl);
   1477    table = le32_to_cpu(table);
   1478    pba = le32_to_cpu(pba);
   1479
   1480    msix = g_malloc0(sizeof(*msix));
   1481    msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
   1482    msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
   1483    msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
   1484    msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
   1485    msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
   1486
   1487    /*
   1488     * Test the size of the pba_offset variable and catch if it extends outside
   1489     * of the specified BAR. If it is the case, we need to apply a hardware
   1490     * specific quirk if the device is known or we have a broken configuration.
   1491     */
   1492    if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
   1493        /*
   1494         * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
   1495         * adapters. The T5 hardware returns an incorrect value of 0x8000 for
   1496         * the VF PBA offset while the BAR itself is only 8k. The correct value
   1497         * is 0x1000, so we hard code that here.
   1498         */
   1499        if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
   1500            (vdev->device_id & 0xff00) == 0x5800) {
   1501            msix->pba_offset = 0x1000;
   1502        /*
   1503         * BAIDU KUNLUN Virtual Function devices for KUNLUN AI processor
   1504         * return an incorrect value of 0x460000 for the VF PBA offset while
   1505         * the BAR itself is only 0x10000.  The correct value is 0xb400.
   1506         */
   1507        } else if (vfio_pci_is(vdev, PCI_VENDOR_ID_BAIDU,
   1508                               PCI_DEVICE_ID_KUNLUN_VF)) {
   1509            msix->pba_offset = 0xb400;
   1510        } else if (vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
   1511            error_setg(errp, "hardware reports invalid configuration, "
   1512                       "MSIX PBA outside of specified BAR");
   1513            g_free(msix);
   1514            return;
   1515        }
   1516    }
   1517
   1518    trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
   1519                                msix->table_offset, msix->entries);
   1520    vdev->msix = msix;
   1521
   1522    vfio_pci_fixup_msix_region(vdev);
   1523
   1524    vfio_pci_relocate_msix(vdev, errp);
   1525}
   1526
   1527static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
   1528{
   1529    int ret;
   1530    Error *err = NULL;
   1531
   1532    vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
   1533                                    sizeof(unsigned long));
   1534    ret = msix_init(&vdev->pdev, vdev->msix->entries,
   1535                    vdev->bars[vdev->msix->table_bar].mr,
   1536                    vdev->msix->table_bar, vdev->msix->table_offset,
   1537                    vdev->bars[vdev->msix->pba_bar].mr,
   1538                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
   1539                    &err);
   1540    if (ret < 0) {
   1541        if (ret == -ENOTSUP) {
   1542            warn_report_err(err);
   1543            return 0;
   1544        }
   1545
   1546        error_propagate(errp, err);
   1547        return ret;
   1548    }
   1549
   1550    /*
   1551     * The PCI spec suggests that devices provide additional alignment for
   1552     * MSI-X structures and avoid overlapping non-MSI-X related registers.
   1553     * For an assigned device, this hopefully means that emulation of MSI-X
   1554     * structures does not affect the performance of the device.  If devices
   1555     * fail to provide that alignment, a significant performance penalty may
   1556     * result, for instance Mellanox MT27500 VFs:
   1557     * http://www.spinics.net/lists/kvm/msg125881.html
   1558     *
   1559     * The PBA is simply not that important for such a serious regression and
   1560     * most drivers do not appear to look at it.  The solution for this is to
   1561     * disable the PBA MemoryRegion unless it's being used.  We disable it
   1562     * here and only enable it if a masked vector fires through QEMU.  As the
   1563     * vector-use notifier is called, which occurs on unmask, we test whether
   1564     * PBA emulation is needed and again disable if not.
   1565     */
   1566    memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
   1567
   1568    /*
   1569     * The emulated machine may provide a paravirt interface for MSIX setup
   1570     * so it is not strictly necessary to emulate MSIX here. This becomes
   1571     * helpful when frequently accessed MMIO registers are located in
   1572     * subpages adjacent to the MSIX table but the MSIX data containing page
   1573     * cannot be mapped because of a host page size bigger than the MSIX table
   1574     * alignment.
   1575     */
   1576    if (object_property_get_bool(OBJECT(qdev_get_machine()),
   1577                                 "vfio-no-msix-emulation", NULL)) {
   1578        memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
   1579    }
   1580
   1581    return 0;
   1582}
   1583
   1584static void vfio_teardown_msi(VFIOPCIDevice *vdev)
   1585{
   1586    msi_uninit(&vdev->pdev);
   1587
   1588    if (vdev->msix) {
   1589        msix_uninit(&vdev->pdev,
   1590                    vdev->bars[vdev->msix->table_bar].mr,
   1591                    vdev->bars[vdev->msix->pba_bar].mr);
   1592        g_free(vdev->msix->pending);
   1593    }
   1594}
   1595
   1596/*
   1597 * Resource setup
   1598 */
   1599static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
   1600{
   1601    int i;
   1602
   1603    for (i = 0; i < PCI_ROM_SLOT; i++) {
   1604        vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled);
   1605    }
   1606}
   1607
   1608static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
   1609{
   1610    VFIOBAR *bar = &vdev->bars[nr];
   1611
   1612    uint32_t pci_bar;
   1613    int ret;
   1614
   1615    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
   1616    if (!bar->region.size) {
   1617        return;
   1618    }
   1619
   1620    /* Determine what type of BAR this is for registration */
   1621    ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
   1622                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
   1623    if (ret != sizeof(pci_bar)) {
   1624        error_report("vfio: Failed to read BAR %d (%m)", nr);
   1625        return;
   1626    }
   1627
   1628    pci_bar = le32_to_cpu(pci_bar);
   1629    bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
   1630    bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
   1631    bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
   1632                                         ~PCI_BASE_ADDRESS_MEM_MASK);
   1633    bar->size = bar->region.size;
   1634}
   1635
   1636static void vfio_bars_prepare(VFIOPCIDevice *vdev)
   1637{
   1638    int i;
   1639
   1640    for (i = 0; i < PCI_ROM_SLOT; i++) {
   1641        vfio_bar_prepare(vdev, i);
   1642    }
   1643}
   1644
   1645static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
   1646{
   1647    VFIOBAR *bar = &vdev->bars[nr];
   1648    char *name;
   1649
   1650    if (!bar->size) {
   1651        return;
   1652    }
   1653
   1654    bar->mr = g_new0(MemoryRegion, 1);
   1655    name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
   1656    memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
   1657    g_free(name);
   1658
   1659    if (bar->region.size) {
   1660        memory_region_add_subregion(bar->mr, 0, bar->region.mem);
   1661
   1662        if (vfio_region_mmap(&bar->region)) {
   1663            error_report("Failed to mmap %s BAR %d. Performance may be slow",
   1664                         vdev->vbasedev.name, nr);
   1665        }
   1666    }
   1667
   1668    pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
   1669}
   1670
   1671static void vfio_bars_register(VFIOPCIDevice *vdev)
   1672{
   1673    int i;
   1674
   1675    for (i = 0; i < PCI_ROM_SLOT; i++) {
   1676        vfio_bar_register(vdev, i);
   1677    }
   1678}
   1679
   1680static void vfio_bars_exit(VFIOPCIDevice *vdev)
   1681{
   1682    int i;
   1683
   1684    for (i = 0; i < PCI_ROM_SLOT; i++) {
   1685        VFIOBAR *bar = &vdev->bars[i];
   1686
   1687        vfio_bar_quirk_exit(vdev, i);
   1688        vfio_region_exit(&bar->region);
   1689        if (bar->region.size) {
   1690            memory_region_del_subregion(bar->mr, bar->region.mem);
   1691        }
   1692    }
   1693
   1694    if (vdev->vga) {
   1695        pci_unregister_vga(&vdev->pdev);
   1696        vfio_vga_quirk_exit(vdev);
   1697    }
   1698}
   1699
   1700static void vfio_bars_finalize(VFIOPCIDevice *vdev)
   1701{
   1702    int i;
   1703
   1704    for (i = 0; i < PCI_ROM_SLOT; i++) {
   1705        VFIOBAR *bar = &vdev->bars[i];
   1706
   1707        vfio_bar_quirk_finalize(vdev, i);
   1708        vfio_region_finalize(&bar->region);
   1709        if (bar->size) {
   1710            object_unparent(OBJECT(bar->mr));
   1711            g_free(bar->mr);
   1712        }
   1713    }
   1714
   1715    if (vdev->vga) {
   1716        vfio_vga_quirk_finalize(vdev);
   1717        for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
   1718            object_unparent(OBJECT(&vdev->vga->region[i].mem));
   1719        }
   1720        g_free(vdev->vga);
   1721    }
   1722}
   1723
   1724/*
   1725 * General setup
   1726 */
   1727static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
   1728{
   1729    uint8_t tmp;
   1730    uint16_t next = PCI_CONFIG_SPACE_SIZE;
   1731
   1732    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
   1733         tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) {
   1734        if (tmp > pos && tmp < next) {
   1735            next = tmp;
   1736        }
   1737    }
   1738
   1739    return next - pos;
   1740}
   1741
   1742
   1743static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
   1744{
   1745    uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
   1746
   1747    for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
   1748        tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
   1749        if (tmp > pos && tmp < next) {
   1750            next = tmp;
   1751        }
   1752    }
   1753
   1754    return next - pos;
   1755}
   1756
   1757static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
   1758{
   1759    pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
   1760}
   1761
   1762static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
   1763                                   uint16_t val, uint16_t mask)
   1764{
   1765    vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
   1766    vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
   1767    vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
   1768}
   1769
   1770static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
   1771{
   1772    pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
   1773}
   1774
   1775static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
   1776                                   uint32_t val, uint32_t mask)
   1777{
   1778    vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
   1779    vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
   1780    vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
   1781}
   1782
/*
 * Expose the PCI Express capability found at @pos (of at most @size bytes)
 * to the guest, adjusting the advertised device type and link registers to
 * fit the bus the device is attached to.
 *
 * Returns the capability offset reported by pci_add_capability() on
 * success, 0 when the capability is intentionally not exposed, or a
 * negative value (with @errp set) on failure.
 */
static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
                               Error **errp)
{
    uint16_t flags;
    uint8_t type;

    flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
    type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;

    /* Only endpoint flavours can be assigned */
    if (type != PCI_EXP_TYPE_ENDPOINT &&
        type != PCI_EXP_TYPE_LEG_END &&
        type != PCI_EXP_TYPE_RC_END) {

        error_setg(errp, "assignment of PCIe type 0x%x "
                   "devices is not currently supported", type);
        return -EINVAL;
    }

    if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
        PCIBus *bus = pci_get_bus(&vdev->pdev);
        PCIDevice *bridge;

        /*
         * Traditionally PCI device assignment exposes the PCIe capability
         * as-is on non-express buses.  The reason being that some drivers
         * simply assume that it's there, for example tg3.  However when
         * we're running on a native PCIe machine type, like Q35, we need
         * to hide the PCIe capability.  The reason for this is twofold;
         * first Windows guests get a Code 10 error when the PCIe capability
         * is exposed in this configuration.  Therefore express devices won't
         * work at all unless they're attached to express buses in the VM.
         * Second, a native PCIe machine introduces the possibility of fine
         * granularity IOMMUs supporting both translation and isolation.
         * Guest code to discover the IOMMU visibility of a device, such as
         * IOMMU grouping code on Linux, is very aware of device types and
         * valid transitions between bus types.  An express device on a non-
         * express bus is not a valid combination on bare metal systems.
         *
         * Drivers that require a PCIe capability to make the device
         * functional are simply going to need to have their devices placed
         * on a PCIe bus in the VM.
         */
        /* Walk up through the bridges to find the root bus */
        while (!pci_bus_is_root(bus)) {
            bridge = pci_bridge_get_device(bus);
            bus = pci_get_bus(bridge);
        }

        /* Express root bus: hide the capability by not adding it */
        if (pci_bus_is_express(bus)) {
            return 0;
        }

    } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
        /*
         * On a Root Complex bus Endpoints become Root Complex Integrated
         * Endpoints, which changes the type and clears the LNK & LNK2 fields.
         */
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control goes away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

/* Fallback offsets for headers that do not define the LNK2 registers */
#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control goes away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }

        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex.  Windows
             * seems to be happier with devices if we skip the capability.
             */
            return 0;
        }

    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                           QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
                           QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }
    }

    /*
     * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
     * (Niantic erratum #35) causing Windows to error with a Code 10 for the
     * device on Q35.  Fixup any such devices to report version 1.  If we
     * were to remove the capability entirely the guest would lose extended
     * config space.
     */
    if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
        vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                               1, PCI_EXP_FLAGS_VERS);
    }

    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
                             errp);
    if (pos < 0) {
        return pos;
    }

    /* Remember where the express capability lives for the PCI core */
    vdev->pdev.exp.exp_cap = pos;

    return pos;
}
   1910
   1911static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
   1912{
   1913    uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
   1914
   1915    if (cap & PCI_EXP_DEVCAP_FLR) {
   1916        trace_vfio_check_pcie_flr(vdev->vbasedev.name);
   1917        vdev->has_flr = true;
   1918    }
   1919}
   1920
   1921static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
   1922{
   1923    uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
   1924
   1925    if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
   1926        trace_vfio_check_pm_reset(vdev->vbasedev.name);
   1927        vdev->has_pm_reset = true;
   1928    }
   1929}
   1930
   1931static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
   1932{
   1933    uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
   1934
   1935    if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
   1936        trace_vfio_check_af_flr(vdev->vbasedev.name);
   1937        vdev->has_flr = true;
   1938    }
   1939}
   1940
/*
 * Add the standard capability located at @pos, and every capability that
 * follows it in the device's chain, to the emulated capability list.
 *
 * The function recurses to the end of the chain first and adds
 * capabilities while unwinding, so the rebuilt list ends up in the same
 * order as the physical one.
 *
 * Returns 0 on success or a negative value (with @errp set) on failure.
 */
static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
{
    PCIDevice *pdev = &vdev->pdev;
    uint8_t cap_id, next, size;
    int ret;

    /* Sample the ID and next pointer before the chain gets rewritten */
    cap_id = pdev->config[pos];
    next = pdev->config[pos + PCI_CAP_LIST_NEXT];

    /*
     * If it becomes important to configure capabilities to their actual
     * size, use this as the default when it's something we don't recognize.
     * Since QEMU doesn't actually handle many of the config accesses,
     * exact size doesn't seem worthwhile.
     */
    size = vfio_std_cap_max_size(pdev, pos);

    /*
     * pci_add_capability always inserts the new capability at the head
     * of the chain.  Therefore to end up with a chain that matches the
     * physical device, we insert from the end by making this recursive.
     * This is also why we pre-calculate size above as cached config space
     * will be changed as we unwind the stack.
     */
    if (next) {
        ret = vfio_add_std_cap(vdev, next, errp);
        if (ret) {
            return ret;
        }
    } else {
        /* Begin the rebuild, use QEMU emulated list bits */
        pdev->config[PCI_CAPABILITY_LIST] = 0;
        vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
        vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;

        ret = vfio_add_virt_caps(vdev, errp);
        if (ret) {
            return ret;
        }
    }

    /* Scale down size, esp in case virt caps were added above */
    size = MIN(size, vfio_std_cap_max_size(pdev, pos));

    /* Use emulated next pointer to allow dropping caps */
    pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);

    /* Capabilities QEMU knows about get dedicated setup, the rest are
     * added verbatim. */
    switch (cap_id) {
    case PCI_CAP_ID_MSI:
        ret = vfio_msi_setup(vdev, pos, errp);
        break;
    case PCI_CAP_ID_EXP:
        vfio_check_pcie_flr(vdev, pos);
        ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
        break;
    case PCI_CAP_ID_MSIX:
        ret = vfio_msix_setup(vdev, pos, errp);
        break;
    case PCI_CAP_ID_PM:
        vfio_check_pm_reset(vdev, pos);
        /* Remembered so the reset path can locate the PM registers */
        vdev->pm_cap = pos;
        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
        break;
    case PCI_CAP_ID_AF:
        vfio_check_af_flr(vdev, pos);
        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
        break;
    default:
        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
        break;
    }

    /* Non-negative results (including returned offsets) count as success */
    if (ret < 0) {
        error_prepend(errp,
                      "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
                      cap_id, size, pos);
        return ret;
    }

    return 0;
}
   2022
   2023static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
   2024{
   2025    PCIDevice *pdev = &vdev->pdev;
   2026    uint32_t header;
   2027    uint16_t cap_id, next, size;
   2028    uint8_t cap_ver;
   2029    uint8_t *config;
   2030
   2031    /* Only add extended caps if we have them and the guest can see them */
   2032    if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) ||
   2033        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
   2034        return;
   2035    }
   2036
   2037    /*
   2038     * pcie_add_capability always inserts the new capability at the tail
   2039     * of the chain.  Therefore to end up with a chain that matches the
   2040     * physical device, we cache the config space to avoid overwriting
   2041     * the original config space when we parse the extended capabilities.
   2042     */
   2043    config = g_memdup(pdev->config, vdev->config_size);
   2044
   2045    /*
   2046     * Extended capabilities are chained with each pointing to the next, so we
   2047     * can drop anything other than the head of the chain simply by modifying
   2048     * the previous next pointer.  Seed the head of the chain here such that
   2049     * we can simply skip any capabilities we want to drop below, regardless
   2050     * of their position in the chain.  If this stub capability still exists
   2051     * after we add the capabilities we want to expose, update the capability
   2052     * ID to zero.  Note that we cannot seed with the capability header being
   2053     * zero as this conflicts with definition of an absent capability chain
   2054     * and prevents capabilities beyond the head of the list from being added.
   2055     * By replacing the dummy capability ID with zero after walking the device
   2056     * chain, we also transparently mark extended capabilities as absent if
   2057     * no capabilities were added.  Note that the PCIe spec defines an absence
   2058     * of extended capabilities to be determined by a value of zero for the
   2059     * capability ID, version, AND next pointer.  A non-zero next pointer
   2060     * should be sufficient to indicate additional capabilities are present,
   2061     * which will occur if we call pcie_add_capability() below.  The entire
   2062     * first dword is emulated to support this.
   2063     *
   2064     * NB. The kernel side does similar masking, so be prepared that our
   2065     * view of the device may also contain a capability ID zero in the head
   2066     * of the chain.  Skip it for the same reason that we cannot seed the
   2067     * chain with a zero capability.
   2068     */
   2069    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
   2070                 PCI_EXT_CAP(0xFFFF, 0, 0));
   2071    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
   2072    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
   2073
   2074    for (next = PCI_CONFIG_SPACE_SIZE; next;
   2075         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
   2076        header = pci_get_long(config + next);
   2077        cap_id = PCI_EXT_CAP_ID(header);
   2078        cap_ver = PCI_EXT_CAP_VER(header);
   2079
   2080        /*
   2081         * If it becomes important to configure extended capabilities to their
   2082         * actual size, use this as the default when it's something we don't
   2083         * recognize. Since QEMU doesn't actually handle many of the config
   2084         * accesses, exact size doesn't seem worthwhile.
   2085         */
   2086        size = vfio_ext_cap_max_size(config, next);
   2087
   2088        /* Use emulated next pointer to allow dropping extended caps */
   2089        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
   2090                                   PCI_EXT_CAP_NEXT_MASK);
   2091
   2092        switch (cap_id) {
   2093        case 0: /* kernel masked capability */
   2094        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
   2095        case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
   2096        case PCI_EXT_CAP_ID_REBAR: /* Can't expose read-only */
   2097            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
   2098            break;
   2099        default:
   2100            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
   2101        }
   2102
   2103    }
   2104
   2105    /* Cleanup chain head ID if necessary */
   2106    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
   2107        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
   2108    }
   2109
   2110    g_free(config);
   2111    return;
   2112}
   2113
   2114static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
   2115{
   2116    PCIDevice *pdev = &vdev->pdev;
   2117    int ret;
   2118
   2119    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
   2120        !pdev->config[PCI_CAPABILITY_LIST]) {
   2121        return 0; /* Nothing to add */
   2122    }
   2123
   2124    ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp);
   2125    if (ret) {
   2126        return ret;
   2127    }
   2128
   2129    vfio_add_ext_cap(vdev);
   2130    return 0;
   2131}
   2132
   2133static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
   2134{
   2135    PCIDevice *pdev = &vdev->pdev;
   2136    uint16_t cmd;
   2137
   2138    vfio_disable_interrupts(vdev);
   2139
   2140    /* Make sure the device is in D0 */
   2141    if (vdev->pm_cap) {
   2142        uint16_t pmcsr;
   2143        uint8_t state;
   2144
   2145        pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
   2146        state = pmcsr & PCI_PM_CTRL_STATE_MASK;
   2147        if (state) {
   2148            pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
   2149            vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
   2150            /* vfio handles the necessary delay here */
   2151            pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
   2152            state = pmcsr & PCI_PM_CTRL_STATE_MASK;
   2153            if (state) {
   2154                error_report("vfio: Unable to power on device, stuck in D%d",
   2155                             state);
   2156            }
   2157        }
   2158    }
   2159
   2160    /*
   2161     * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
   2162     * Also put INTx Disable in known state.
   2163     */
   2164    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
   2165    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
   2166             PCI_COMMAND_INTX_DISABLE);
   2167    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
   2168}
   2169
   2170static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
   2171{
   2172    Error *err = NULL;
   2173    int nr;
   2174
   2175    vfio_intx_enable(vdev, &err);
   2176    if (err) {
   2177        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2178    }
   2179
   2180    for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
   2181        off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
   2182        uint32_t val = 0;
   2183        uint32_t len = sizeof(val);
   2184
   2185        if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
   2186            error_report("%s(%s) reset bar %d failed: %m", __func__,
   2187                         vdev->vbasedev.name, nr);
   2188        }
   2189    }
   2190
   2191    vfio_quirk_reset(vdev);
   2192}
   2193
   2194static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
   2195{
   2196    char tmp[13];
   2197
   2198    sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
   2199            addr->bus, addr->slot, addr->function);
   2200
   2201    return (strcmp(tmp, name) == 0);
   2202}
   2203
   2204static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
   2205{
   2206    VFIOGroup *group;
   2207    struct vfio_pci_hot_reset_info *info;
   2208    struct vfio_pci_dependent_device *devices;
   2209    struct vfio_pci_hot_reset *reset;
   2210    int32_t *fds;
   2211    int ret, i, count;
   2212    bool multi = false;
   2213
   2214    trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
   2215
   2216    if (!single) {
   2217        vfio_pci_pre_reset(vdev);
   2218    }
   2219    vdev->vbasedev.needs_reset = false;
   2220
   2221    info = g_malloc0(sizeof(*info));
   2222    info->argsz = sizeof(*info);
   2223
   2224    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
   2225    if (ret && errno != ENOSPC) {
   2226        ret = -errno;
   2227        if (!vdev->has_pm_reset) {
   2228            error_report("vfio: Cannot reset device %s, "
   2229                         "no available reset mechanism.", vdev->vbasedev.name);
   2230        }
   2231        goto out_single;
   2232    }
   2233
   2234    count = info->count;
   2235    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
   2236    info->argsz = sizeof(*info) + (count * sizeof(*devices));
   2237    devices = &info->devices[0];
   2238
   2239    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
   2240    if (ret) {
   2241        ret = -errno;
   2242        error_report("vfio: hot reset info failed: %m");
   2243        goto out_single;
   2244    }
   2245
   2246    trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
   2247
   2248    /* Verify that we have all the groups required */
   2249    for (i = 0; i < info->count; i++) {
   2250        PCIHostDeviceAddress host;
   2251        VFIOPCIDevice *tmp;
   2252        VFIODevice *vbasedev_iter;
   2253
   2254        host.domain = devices[i].segment;
   2255        host.bus = devices[i].bus;
   2256        host.slot = PCI_SLOT(devices[i].devfn);
   2257        host.function = PCI_FUNC(devices[i].devfn);
   2258
   2259        trace_vfio_pci_hot_reset_dep_devices(host.domain,
   2260                host.bus, host.slot, host.function, devices[i].group_id);
   2261
   2262        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
   2263            continue;
   2264        }
   2265
   2266        QLIST_FOREACH(group, &vfio_group_list, next) {
   2267            if (group->groupid == devices[i].group_id) {
   2268                break;
   2269            }
   2270        }
   2271
   2272        if (!group) {
   2273            if (!vdev->has_pm_reset) {
   2274                error_report("vfio: Cannot reset device %s, "
   2275                             "depends on group %d which is not owned.",
   2276                             vdev->vbasedev.name, devices[i].group_id);
   2277            }
   2278            ret = -EPERM;
   2279            goto out;
   2280        }
   2281
   2282        /* Prep dependent devices for reset and clear our marker. */
   2283        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
   2284            if (!vbasedev_iter->dev->realized ||
   2285                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
   2286                continue;
   2287            }
   2288            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
   2289            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
   2290                if (single) {
   2291                    ret = -EINVAL;
   2292                    goto out_single;
   2293                }
   2294                vfio_pci_pre_reset(tmp);
   2295                tmp->vbasedev.needs_reset = false;
   2296                multi = true;
   2297                break;
   2298            }
   2299        }
   2300    }
   2301
   2302    if (!single && !multi) {
   2303        ret = -EINVAL;
   2304        goto out_single;
   2305    }
   2306
   2307    /* Determine how many group fds need to be passed */
   2308    count = 0;
   2309    QLIST_FOREACH(group, &vfio_group_list, next) {
   2310        for (i = 0; i < info->count; i++) {
   2311            if (group->groupid == devices[i].group_id) {
   2312                count++;
   2313                break;
   2314            }
   2315        }
   2316    }
   2317
   2318    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
   2319    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
   2320    fds = &reset->group_fds[0];
   2321
   2322    /* Fill in group fds */
   2323    QLIST_FOREACH(group, &vfio_group_list, next) {
   2324        for (i = 0; i < info->count; i++) {
   2325            if (group->groupid == devices[i].group_id) {
   2326                fds[reset->count++] = group->fd;
   2327                break;
   2328            }
   2329        }
   2330    }
   2331
   2332    /* Bus reset! */
   2333    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
   2334    g_free(reset);
   2335
   2336    trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
   2337                                    ret ? "%m" : "Success");
   2338
   2339out:
   2340    /* Re-enable INTx on affected devices */
   2341    for (i = 0; i < info->count; i++) {
   2342        PCIHostDeviceAddress host;
   2343        VFIOPCIDevice *tmp;
   2344        VFIODevice *vbasedev_iter;
   2345
   2346        host.domain = devices[i].segment;
   2347        host.bus = devices[i].bus;
   2348        host.slot = PCI_SLOT(devices[i].devfn);
   2349        host.function = PCI_FUNC(devices[i].devfn);
   2350
   2351        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
   2352            continue;
   2353        }
   2354
   2355        QLIST_FOREACH(group, &vfio_group_list, next) {
   2356            if (group->groupid == devices[i].group_id) {
   2357                break;
   2358            }
   2359        }
   2360
   2361        if (!group) {
   2362            break;
   2363        }
   2364
   2365        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
   2366            if (!vbasedev_iter->dev->realized ||
   2367                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
   2368                continue;
   2369            }
   2370            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
   2371            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
   2372                vfio_pci_post_reset(tmp);
   2373                break;
   2374            }
   2375        }
   2376    }
   2377out_single:
   2378    if (!single) {
   2379        vfio_pci_post_reset(vdev);
   2380    }
   2381    g_free(info);
   2382
   2383    return ret;
   2384}
   2385
   2386/*
   2387 * We want to differentiate hot reset of multiple in-use devices vs hot reset
   2388 * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
   2389 * of doing hot resets when there is only a single device per bus.  The in-use
   2390 * here refers to how many VFIODevices are affected.  A hot reset that affects
   2391 * multiple devices, but only a single in-use device, means that we can call
   2392 * it from our bus ->reset() callback since the extent is effectively a single
   2393 * device.  This allows us to make use of it in the hotplug path.  When there
   2394 * are multiple in-use devices, we can only trigger the hot reset during a
   2395 * system reset and thus from our reset handler.  We separate _one vs _multi
   2396 * here so that we don't overlap and do a double reset on the system reset
   2397 * path where both our reset handler and ->reset() callback are used.  Calling
   2398 * _one() will only do a hot reset for the one in-use devices case, calling
   2399 * _multi() will do nothing if a _one() would have been sufficient.
   2400 */
   2401static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
   2402{
   2403    return vfio_pci_hot_reset(vdev, true);
   2404}
   2405
   2406static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
   2407{
   2408    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
   2409    return vfio_pci_hot_reset(vdev, false);
   2410}
   2411
   2412static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
   2413{
   2414    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
   2415    if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
   2416        vbasedev->needs_reset = true;
   2417    }
   2418}
   2419
   2420static Object *vfio_pci_get_object(VFIODevice *vbasedev)
   2421{
   2422    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
   2423
   2424    return OBJECT(vdev);
   2425}
   2426
   2427static bool vfio_msix_present(void *opaque, int version_id)
   2428{
   2429    PCIDevice *pdev = opaque;
   2430
   2431    return msix_present(pdev);
   2432}
   2433
   2434const VMStateDescription vmstate_vfio_pci_config = {
   2435    .name = "VFIOPCIDevice",
   2436    .version_id = 1,
   2437    .minimum_version_id = 1,
   2438    .fields = (VMStateField[]) {
   2439        VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
   2440        VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
   2441        VMSTATE_END_OF_LIST()
   2442    }
   2443};
   2444
   2445static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
   2446{
   2447    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
   2448
   2449    vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL);
   2450}
   2451
   2452static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
   2453{
   2454    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
   2455    PCIDevice *pdev = &vdev->pdev;
   2456    int ret;
   2457
   2458    ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1);
   2459    if (ret) {
   2460        return ret;
   2461    }
   2462
   2463    vfio_pci_write_config(pdev, PCI_COMMAND,
   2464                          pci_get_word(pdev->config + PCI_COMMAND), 2);
   2465
   2466    if (msi_enabled(pdev)) {
   2467        vfio_msi_enable(vdev);
   2468    } else if (msix_enabled(pdev)) {
   2469        vfio_msix_enable(vdev);
   2470    }
   2471
   2472    return ret;
   2473}
   2474
   2475static VFIODeviceOps vfio_pci_ops = {
   2476    .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
   2477    .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
   2478    .vfio_eoi = vfio_intx_eoi,
   2479    .vfio_get_object = vfio_pci_get_object,
   2480    .vfio_save_config = vfio_pci_save_config,
   2481    .vfio_load_config = vfio_pci_load_config,
   2482};
   2483
   2484int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
   2485{
   2486    VFIODevice *vbasedev = &vdev->vbasedev;
   2487    struct vfio_region_info *reg_info;
   2488    int ret;
   2489
   2490    ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
   2491    if (ret) {
   2492        error_setg_errno(errp, -ret,
   2493                         "failed getting region info for VGA region index %d",
   2494                         VFIO_PCI_VGA_REGION_INDEX);
   2495        return ret;
   2496    }
   2497
   2498    if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
   2499        !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
   2500        reg_info->size < 0xbffff + 1) {
   2501        error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
   2502                   (unsigned long)reg_info->flags,
   2503                   (unsigned long)reg_info->size);
   2504        g_free(reg_info);
   2505        return -EINVAL;
   2506    }
   2507
   2508    vdev->vga = g_new0(VFIOVGA, 1);
   2509
   2510    vdev->vga->fd_offset = reg_info->offset;
   2511    vdev->vga->fd = vdev->vbasedev.fd;
   2512
   2513    g_free(reg_info);
   2514
   2515    vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
   2516    vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
   2517    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
   2518
   2519    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
   2520                          OBJECT(vdev), &vfio_vga_ops,
   2521                          &vdev->vga->region[QEMU_PCI_VGA_MEM],
   2522                          "vfio-vga-mmio@0xa0000",
   2523                          QEMU_PCI_VGA_MEM_SIZE);
   2524
   2525    vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
   2526    vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
   2527    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
   2528
   2529    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
   2530                          OBJECT(vdev), &vfio_vga_ops,
   2531                          &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
   2532                          "vfio-vga-io@0x3b0",
   2533                          QEMU_PCI_VGA_IO_LO_SIZE);
   2534
   2535    vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
   2536    vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
   2537    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
   2538
   2539    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
   2540                          OBJECT(vdev), &vfio_vga_ops,
   2541                          &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
   2542                          "vfio-vga-io@0x3c0",
   2543                          QEMU_PCI_VGA_IO_HI_SIZE);
   2544
   2545    pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
   2546                     &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
   2547                     &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
   2548
   2549    return 0;
   2550}
   2551
   2552static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
   2553{
   2554    VFIODevice *vbasedev = &vdev->vbasedev;
   2555    struct vfio_region_info *reg_info;
   2556    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
   2557    int i, ret = -1;
   2558
   2559    /* Sanity check device */
   2560    if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
   2561        error_setg(errp, "this isn't a PCI device");
   2562        return;
   2563    }
   2564
   2565    if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
   2566        error_setg(errp, "unexpected number of io regions %u",
   2567                   vbasedev->num_regions);
   2568        return;
   2569    }
   2570
   2571    if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
   2572        error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
   2573        return;
   2574    }
   2575
   2576    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
   2577        char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);
   2578
   2579        ret = vfio_region_setup(OBJECT(vdev), vbasedev,
   2580                                &vdev->bars[i].region, i, name);
   2581        g_free(name);
   2582
   2583        if (ret) {
   2584            error_setg_errno(errp, -ret, "failed to get region %d info", i);
   2585            return;
   2586        }
   2587
   2588        QLIST_INIT(&vdev->bars[i].quirks);
   2589    }
   2590
   2591    ret = vfio_get_region_info(vbasedev,
   2592                               VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
   2593    if (ret) {
   2594        error_setg_errno(errp, -ret, "failed to get config info");
   2595        return;
   2596    }
   2597
   2598    trace_vfio_populate_device_config(vdev->vbasedev.name,
   2599                                      (unsigned long)reg_info->size,
   2600                                      (unsigned long)reg_info->offset,
   2601                                      (unsigned long)reg_info->flags);
   2602
   2603    vdev->config_size = reg_info->size;
   2604    if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
   2605        vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
   2606    }
   2607    vdev->config_offset = reg_info->offset;
   2608
   2609    g_free(reg_info);
   2610
   2611    if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
   2612        ret = vfio_populate_vga(vdev, errp);
   2613        if (ret) {
   2614            error_append_hint(errp, "device does not support "
   2615                              "requested feature x-vga\n");
   2616            return;
   2617        }
   2618    }
   2619
   2620    irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
   2621
   2622    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
   2623    if (ret) {
   2624        /* This can fail for an old kernel or legacy PCI dev */
   2625        trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
   2626    } else if (irq_info.count == 1) {
   2627        vdev->pci_aer = true;
   2628    } else {
   2629        warn_report(VFIO_MSG_PREFIX
   2630                    "Could not enable error recovery for the device",
   2631                    vbasedev->name);
   2632    }
   2633}
   2634
   2635static void vfio_put_device(VFIOPCIDevice *vdev)
   2636{
   2637    g_free(vdev->vbasedev.name);
   2638    g_free(vdev->msix);
   2639
   2640    vfio_put_base_device(&vdev->vbasedev);
   2641}
   2642
   2643static void vfio_err_notifier_handler(void *opaque)
   2644{
   2645    VFIOPCIDevice *vdev = opaque;
   2646
   2647    if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
   2648        return;
   2649    }
   2650
   2651    /*
   2652     * TBD. Retrieve the error details and decide what action
   2653     * needs to be taken. One of the actions could be to pass
   2654     * the error to the guest and have the guest driver recover
   2655     * from the error. This requires that PCIe capabilities be
   2656     * exposed to the guest. For now, we just terminate the
   2657     * guest to contain the error.
   2658     */
   2659
   2660    error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
   2661
   2662    vm_stop(RUN_STATE_INTERNAL_ERROR);
   2663}
   2664
   2665/*
   2666 * Registers error notifier for devices supporting error recovery.
   2667 * If we encounter a failure in this function, we report an error
   2668 * and continue after disabling error recovery support for the
   2669 * device.
   2670 */
   2671static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
   2672{
   2673    Error *err = NULL;
   2674    int32_t fd;
   2675
   2676    if (!vdev->pci_aer) {
   2677        return;
   2678    }
   2679
   2680    if (event_notifier_init(&vdev->err_notifier, 0)) {
   2681        error_report("vfio: Unable to init event notifier for error detection");
   2682        vdev->pci_aer = false;
   2683        return;
   2684    }
   2685
   2686    fd = event_notifier_get_fd(&vdev->err_notifier);
   2687    qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
   2688
   2689    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
   2690                               VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
   2691        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2692        qemu_set_fd_handler(fd, NULL, NULL, vdev);
   2693        event_notifier_cleanup(&vdev->err_notifier);
   2694        vdev->pci_aer = false;
   2695    }
   2696}
   2697
   2698static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
   2699{
   2700    Error *err = NULL;
   2701
   2702    if (!vdev->pci_aer) {
   2703        return;
   2704    }
   2705
   2706    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
   2707                               VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
   2708        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2709    }
   2710    qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
   2711                        NULL, NULL, vdev);
   2712    event_notifier_cleanup(&vdev->err_notifier);
   2713}
   2714
   2715static void vfio_req_notifier_handler(void *opaque)
   2716{
   2717    VFIOPCIDevice *vdev = opaque;
   2718    Error *err = NULL;
   2719
   2720    if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
   2721        return;
   2722    }
   2723
   2724    qdev_unplug(DEVICE(vdev), &err);
   2725    if (err) {
   2726        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2727    }
   2728}
   2729
   2730static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
   2731{
   2732    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
   2733                                      .index = VFIO_PCI_REQ_IRQ_INDEX };
   2734    Error *err = NULL;
   2735    int32_t fd;
   2736
   2737    if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
   2738        return;
   2739    }
   2740
   2741    if (ioctl(vdev->vbasedev.fd,
   2742              VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
   2743        return;
   2744    }
   2745
   2746    if (event_notifier_init(&vdev->req_notifier, 0)) {
   2747        error_report("vfio: Unable to init event notifier for device request");
   2748        return;
   2749    }
   2750
   2751    fd = event_notifier_get_fd(&vdev->req_notifier);
   2752    qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
   2753
   2754    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
   2755                           VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
   2756        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2757        qemu_set_fd_handler(fd, NULL, NULL, vdev);
   2758        event_notifier_cleanup(&vdev->req_notifier);
   2759    } else {
   2760        vdev->req_enabled = true;
   2761    }
   2762}
   2763
   2764static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
   2765{
   2766    Error *err = NULL;
   2767
   2768    if (!vdev->req_enabled) {
   2769        return;
   2770    }
   2771
   2772    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
   2773                               VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
   2774        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2775    }
   2776    qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
   2777                        NULL, NULL, vdev);
   2778    event_notifier_cleanup(&vdev->req_notifier);
   2779
   2780    vdev->req_enabled = false;
   2781}
   2782
   2783static void vfio_realize(PCIDevice *pdev, Error **errp)
   2784{
   2785    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
   2786    VFIODevice *vbasedev_iter;
   2787    VFIOGroup *group;
   2788    char *tmp, *subsys, group_path[PATH_MAX], *group_name;
   2789    Error *err = NULL;
   2790    ssize_t len;
   2791    struct stat st;
   2792    int groupid;
   2793    int i, ret;
   2794    bool is_mdev;
   2795
   2796    if (!vdev->vbasedev.sysfsdev) {
   2797        if (!(~vdev->host.domain || ~vdev->host.bus ||
   2798              ~vdev->host.slot || ~vdev->host.function)) {
   2799            error_setg(errp, "No provided host device");
   2800            error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
   2801                              "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
   2802            return;
   2803        }
   2804        vdev->vbasedev.sysfsdev =
   2805            g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
   2806                            vdev->host.domain, vdev->host.bus,
   2807                            vdev->host.slot, vdev->host.function);
   2808    }
   2809
   2810    if (stat(vdev->vbasedev.sysfsdev, &st) < 0) {
   2811        error_setg_errno(errp, errno, "no such host device");
   2812        error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.sysfsdev);
   2813        return;
   2814    }
   2815
   2816    vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev);
   2817    vdev->vbasedev.ops = &vfio_pci_ops;
   2818    vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
   2819    vdev->vbasedev.dev = DEVICE(vdev);
   2820
   2821    tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev);
   2822    len = readlink(tmp, group_path, sizeof(group_path));
   2823    g_free(tmp);
   2824
   2825    if (len <= 0 || len >= sizeof(group_path)) {
   2826        error_setg_errno(errp, len < 0 ? errno : ENAMETOOLONG,
   2827                         "no iommu_group found");
   2828        goto error;
   2829    }
   2830
   2831    group_path[len] = 0;
   2832
   2833    group_name = basename(group_path);
   2834    if (sscanf(group_name, "%d", &groupid) != 1) {
   2835        error_setg_errno(errp, errno, "failed to read %s", group_path);
   2836        goto error;
   2837    }
   2838
   2839    trace_vfio_realize(vdev->vbasedev.name, groupid);
   2840
   2841    group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp);
   2842    if (!group) {
   2843        goto error;
   2844    }
   2845
   2846    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
   2847        if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) {
   2848            error_setg(errp, "device is already attached");
   2849            vfio_put_group(group);
   2850            goto error;
   2851        }
   2852    }
   2853
   2854    /*
   2855     * Mediated devices *might* operate compatibly with discarding of RAM, but
   2856     * we cannot know for certain, it depends on whether the mdev vendor driver
   2857     * stays in sync with the active working set of the guest driver.  Prevent
   2858     * the x-balloon-allowed option unless this is minimally an mdev device.
   2859     */
   2860    tmp = g_strdup_printf("%s/subsystem", vdev->vbasedev.sysfsdev);
   2861    subsys = realpath(tmp, NULL);
   2862    g_free(tmp);
   2863    is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
   2864    free(subsys);
   2865
   2866    trace_vfio_mdev(vdev->vbasedev.name, is_mdev);
   2867
   2868    if (vdev->vbasedev.ram_block_discard_allowed && !is_mdev) {
   2869        error_setg(errp, "x-balloon-allowed only potentially compatible "
   2870                   "with mdev devices");
   2871        vfio_put_group(group);
   2872        goto error;
   2873    }
   2874
   2875    ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp);
   2876    if (ret) {
   2877        vfio_put_group(group);
   2878        goto error;
   2879    }
   2880
   2881    vfio_populate_device(vdev, &err);
   2882    if (err) {
   2883        error_propagate(errp, err);
   2884        goto error;
   2885    }
   2886
   2887    /* Get a copy of config space */
   2888    ret = pread(vdev->vbasedev.fd, vdev->pdev.config,
   2889                MIN(pci_config_size(&vdev->pdev), vdev->config_size),
   2890                vdev->config_offset);
   2891    if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
   2892        ret = ret < 0 ? -errno : -EFAULT;
   2893        error_setg_errno(errp, -ret, "failed to read device config space");
   2894        goto error;
   2895    }
   2896
   2897    /* vfio emulates a lot for us, but some bits need extra love */
   2898    vdev->emulated_config_bits = g_malloc0(vdev->config_size);
   2899
   2900    /* QEMU can choose to expose the ROM or not */
   2901    memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
   2902    /* QEMU can also add or extend BARs */
   2903    memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
   2904
   2905    /*
   2906     * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
   2907     * device ID is managed by the vendor and need only be a 16-bit value.
   2908     * Allow any 16-bit value for subsystem so they can be hidden or changed.
   2909     */
   2910    if (vdev->vendor_id != PCI_ANY_ID) {
   2911        if (vdev->vendor_id >= 0xffff) {
   2912            error_setg(errp, "invalid PCI vendor ID provided");
   2913            goto error;
   2914        }
   2915        vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
   2916        trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id);
   2917    } else {
   2918        vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
   2919    }
   2920
   2921    if (vdev->device_id != PCI_ANY_ID) {
   2922        if (vdev->device_id > 0xffff) {
   2923            error_setg(errp, "invalid PCI device ID provided");
   2924            goto error;
   2925        }
   2926        vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
   2927        trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id);
   2928    } else {
   2929        vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
   2930    }
   2931
   2932    if (vdev->sub_vendor_id != PCI_ANY_ID) {
   2933        if (vdev->sub_vendor_id > 0xffff) {
   2934            error_setg(errp, "invalid PCI subsystem vendor ID provided");
   2935            goto error;
   2936        }
   2937        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
   2938                               vdev->sub_vendor_id, ~0);
   2939        trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name,
   2940                                              vdev->sub_vendor_id);
   2941    }
   2942
   2943    if (vdev->sub_device_id != PCI_ANY_ID) {
   2944        if (vdev->sub_device_id > 0xffff) {
   2945            error_setg(errp, "invalid PCI subsystem device ID provided");
   2946            goto error;
   2947        }
   2948        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
   2949        trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name,
   2950                                              vdev->sub_device_id);
   2951    }
   2952
   2953    /* QEMU can change multi-function devices to single function, or reverse */
   2954    vdev->emulated_config_bits[PCI_HEADER_TYPE] =
   2955                                              PCI_HEADER_TYPE_MULTI_FUNCTION;
   2956
   2957    /* Restore or clear multifunction, this is always controlled by QEMU */
   2958    if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
   2959        vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
   2960    } else {
   2961        vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
   2962    }
   2963
   2964    /*
   2965     * Clear host resource mapping info.  If we choose not to register a
   2966     * BAR, such as might be the case with the option ROM, we can get
   2967     * confusing, unwritable, residual addresses from the host here.
   2968     */
   2969    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
   2970    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
   2971
   2972    vfio_pci_size_rom(vdev);
   2973
   2974    vfio_bars_prepare(vdev);
   2975
   2976    vfio_msix_early_setup(vdev, &err);
   2977    if (err) {
   2978        error_propagate(errp, err);
   2979        goto error;
   2980    }
   2981
   2982    vfio_bars_register(vdev);
   2983
   2984    ret = vfio_add_capabilities(vdev, errp);
   2985    if (ret) {
   2986        goto out_teardown;
   2987    }
   2988
   2989    if (vdev->vga) {
   2990        vfio_vga_quirk_setup(vdev);
   2991    }
   2992
   2993    for (i = 0; i < PCI_ROM_SLOT; i++) {
   2994        vfio_bar_quirk_setup(vdev, i);
   2995    }
   2996
   2997    if (!vdev->igd_opregion &&
   2998        vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) {
   2999        struct vfio_region_info *opregion;
   3000
   3001        if (vdev->pdev.qdev.hotplugged) {
   3002            error_setg(errp,
   3003                       "cannot support IGD OpRegion feature on hotplugged "
   3004                       "device");
   3005            goto out_teardown;
   3006        }
   3007
   3008        ret = vfio_get_dev_region_info(&vdev->vbasedev,
   3009                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
   3010                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
   3011        if (ret) {
   3012            error_setg_errno(errp, -ret,
   3013                             "does not support requested IGD OpRegion feature");
   3014            goto out_teardown;
   3015        }
   3016
   3017        ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
   3018        g_free(opregion);
   3019        if (ret) {
   3020            goto out_teardown;
   3021        }
   3022    }
   3023
   3024    /* QEMU emulates all of MSI & MSIX */
   3025    if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
   3026        memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
   3027               MSIX_CAP_LENGTH);
   3028    }
   3029
   3030    if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
   3031        memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
   3032               vdev->msi_cap_size);
   3033    }
   3034
   3035    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
   3036        vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
   3037                                                  vfio_intx_mmap_enable, vdev);
   3038        pci_device_set_intx_routing_notifier(&vdev->pdev,
   3039                                             vfio_intx_routing_notifier);
   3040        vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
   3041        kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
   3042        ret = vfio_intx_enable(vdev, errp);
   3043        if (ret) {
   3044            goto out_deregister;
   3045        }
   3046    }
   3047
   3048    if (vdev->display != ON_OFF_AUTO_OFF) {
   3049        ret = vfio_display_probe(vdev, errp);
   3050        if (ret) {
   3051            goto out_deregister;
   3052        }
   3053    }
   3054    if (vdev->enable_ramfb && vdev->dpy == NULL) {
   3055        error_setg(errp, "ramfb=on requires display=on");
   3056        goto out_deregister;
   3057    }
   3058    if (vdev->display_xres || vdev->display_yres) {
   3059        if (vdev->dpy == NULL) {
   3060            error_setg(errp, "xres and yres properties require display=on");
   3061            goto out_deregister;
   3062        }
   3063        if (vdev->dpy->edid_regs == NULL) {
   3064            error_setg(errp, "xres and yres properties need edid support");
   3065            goto out_deregister;
   3066        }
   3067    }
   3068
   3069    if (vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
   3070        ret = vfio_pci_nvidia_v100_ram_init(vdev, errp);
   3071        if (ret && ret != -ENODEV) {
   3072            error_report("Failed to setup NVIDIA V100 GPU RAM");
   3073        }
   3074    }
   3075
   3076    if (vfio_pci_is(vdev, PCI_VENDOR_ID_IBM, PCI_ANY_ID)) {
   3077        ret = vfio_pci_nvlink2_init(vdev, errp);
   3078        if (ret && ret != -ENODEV) {
   3079            error_report("Failed to setup NVlink2 bridge");
   3080        }
   3081    }
   3082
   3083    if (!pdev->failover_pair_id) {
   3084        ret = vfio_migration_probe(&vdev->vbasedev, errp);
   3085        if (ret) {
   3086            error_report("%s: Migration disabled", vdev->vbasedev.name);
   3087        }
   3088    }
   3089
   3090    vfio_register_err_notifier(vdev);
   3091    vfio_register_req_notifier(vdev);
   3092    vfio_setup_resetfn_quirk(vdev);
   3093
   3094    return;
   3095
   3096out_deregister:
   3097    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
   3098    kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
   3099out_teardown:
   3100    vfio_teardown_msi(vdev);
   3101    vfio_bars_exit(vdev);
   3102error:
   3103    error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   3104}
   3105
   3106static void vfio_instance_finalize(Object *obj)
   3107{
   3108    VFIOPCIDevice *vdev = VFIO_PCI(obj);
   3109    VFIOGroup *group = vdev->vbasedev.group;
   3110
   3111    vfio_display_finalize(vdev);
   3112    vfio_bars_finalize(vdev);
   3113    g_free(vdev->emulated_config_bits);
   3114    g_free(vdev->rom);
   3115    /*
   3116     * XXX Leaking igd_opregion is not an oversight, we can't remove the
   3117     * fw_cfg entry therefore leaking this allocation seems like the safest
   3118     * option.
   3119     *
   3120     * g_free(vdev->igd_opregion);
   3121     */
   3122    vfio_put_device(vdev);
   3123    vfio_put_group(group);
   3124}
   3125
   3126static void vfio_exitfn(PCIDevice *pdev)
   3127{
   3128    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
   3129
   3130    vfio_unregister_req_notifier(vdev);
   3131    vfio_unregister_err_notifier(vdev);
   3132    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
   3133    if (vdev->irqchip_change_notifier.notify) {
   3134        kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
   3135    }
   3136    vfio_disable_interrupts(vdev);
   3137    if (vdev->intx.mmap_timer) {
   3138        timer_free(vdev->intx.mmap_timer);
   3139    }
   3140    vfio_teardown_msi(vdev);
   3141    vfio_bars_exit(vdev);
   3142    vfio_migration_finalize(&vdev->vbasedev);
   3143}
   3144
   3145static void vfio_pci_reset(DeviceState *dev)
   3146{
   3147    VFIOPCIDevice *vdev = VFIO_PCI(dev);
   3148
   3149    trace_vfio_pci_reset(vdev->vbasedev.name);
   3150
   3151    vfio_pci_pre_reset(vdev);
   3152
   3153    if (vdev->display != ON_OFF_AUTO_OFF) {
   3154        vfio_display_reset(vdev);
   3155    }
   3156
   3157    if (vdev->resetfn && !vdev->resetfn(vdev)) {
   3158        goto post_reset;
   3159    }
   3160
   3161    if (vdev->vbasedev.reset_works &&
   3162        (vdev->has_flr || !vdev->has_pm_reset) &&
   3163        !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
   3164        trace_vfio_pci_reset_flr(vdev->vbasedev.name);
   3165        goto post_reset;
   3166    }
   3167
   3168    /* See if we can do our own bus reset */
   3169    if (!vfio_pci_hot_reset_one(vdev)) {
   3170        goto post_reset;
   3171    }
   3172
   3173    /* If nothing else works and the device supports PM reset, use it */
   3174    if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
   3175        !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
   3176        trace_vfio_pci_reset_pm(vdev->vbasedev.name);
   3177        goto post_reset;
   3178    }
   3179
   3180post_reset:
   3181    vfio_pci_post_reset(vdev);
   3182}
   3183
   3184static void vfio_instance_init(Object *obj)
   3185{
   3186    PCIDevice *pci_dev = PCI_DEVICE(obj);
   3187    VFIOPCIDevice *vdev = VFIO_PCI(obj);
   3188
   3189    device_add_bootindex_property(obj, &vdev->bootindex,
   3190                                  "bootindex", NULL,
   3191                                  &pci_dev->qdev);
   3192    vdev->host.domain = ~0U;
   3193    vdev->host.bus = ~0U;
   3194    vdev->host.slot = ~0U;
   3195    vdev->host.function = ~0U;
   3196
   3197    vdev->nv_gpudirect_clique = 0xFF;
   3198
   3199    /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command
   3200     * line, therefore, no need to wait to realize like other devices */
   3201    pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
   3202}
   3203
   3204static Property vfio_pci_dev_properties[] = {
   3205    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
   3206    DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
   3207    DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
   3208                            vbasedev.pre_copy_dirty_page_tracking,
   3209                            ON_OFF_AUTO_ON),
   3210    DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
   3211                            display, ON_OFF_AUTO_OFF),
   3212    DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
   3213    DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
   3214    DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
   3215                       intx.mmap_timeout, 1100),
   3216    DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
   3217                    VFIO_FEATURE_ENABLE_VGA_BIT, false),
   3218    DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
   3219                    VFIO_FEATURE_ENABLE_REQ_BIT, true),
   3220    DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
   3221                    VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
   3222    DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice,
   3223                     vbasedev.enable_migration, false),
   3224    DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
   3225    DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
   3226                     vbasedev.ram_block_discard_allowed, false),
   3227    DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
   3228    DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
   3229    DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
   3230    DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
   3231                     no_geforce_quirks, false),
   3232    DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
   3233                     false),
   3234    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
   3235                     false),
   3236    DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
   3237    DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
   3238    DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
   3239                       sub_vendor_id, PCI_ANY_ID),
   3240    DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
   3241                       sub_device_id, PCI_ANY_ID),
   3242    DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
   3243    DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
   3244                                   nv_gpudirect_clique,
   3245                                   qdev_prop_nv_gpudirect_clique, uint8_t),
   3246    DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
   3247                                OFF_AUTOPCIBAR_OFF),
   3248    /*
   3249     * TODO - support passed fds... is this necessary?
   3250     * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
   3251     * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name),
   3252     */
   3253    DEFINE_PROP_END_OF_LIST(),
   3254};
   3255
   3256static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
   3257{
   3258    DeviceClass *dc = DEVICE_CLASS(klass);
   3259    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
   3260
   3261    dc->reset = vfio_pci_reset;
   3262    device_class_set_props(dc, vfio_pci_dev_properties);
   3263    dc->desc = "VFIO-based PCI device assignment";
   3264    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
   3265    pdc->realize = vfio_realize;
   3266    pdc->exit = vfio_exitfn;
   3267    pdc->config_read = vfio_pci_read_config;
   3268    pdc->config_write = vfio_pci_write_config;
   3269}
   3270
   3271static const TypeInfo vfio_pci_dev_info = {
   3272    .name = TYPE_VFIO_PCI,
   3273    .parent = TYPE_PCI_DEVICE,
   3274    .instance_size = sizeof(VFIOPCIDevice),
   3275    .class_init = vfio_pci_dev_class_init,
   3276    .instance_init = vfio_instance_init,
   3277    .instance_finalize = vfio_instance_finalize,
   3278    .interfaces = (InterfaceInfo[]) {
   3279        { INTERFACE_PCIE_DEVICE },
   3280        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
   3281        { }
   3282    },
   3283};
   3284
   3285static Property vfio_pci_dev_nohotplug_properties[] = {
   3286    DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
   3287    DEFINE_PROP_END_OF_LIST(),
   3288};
   3289
   3290static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data)
   3291{
   3292    DeviceClass *dc = DEVICE_CLASS(klass);
   3293
   3294    device_class_set_props(dc, vfio_pci_dev_nohotplug_properties);
   3295    dc->hotpluggable = false;
   3296}
   3297
   3298static const TypeInfo vfio_pci_nohotplug_dev_info = {
   3299    .name = TYPE_VFIO_PCI_NOHOTPLUG,
   3300    .parent = TYPE_VFIO_PCI,
   3301    .instance_size = sizeof(VFIOPCIDevice),
   3302    .class_init = vfio_pci_nohotplug_dev_class_init,
   3303};
   3304
   3305static void register_vfio_pci_dev_type(void)
   3306{
   3307    type_register_static(&vfio_pci_dev_info);
   3308    type_register_static(&vfio_pci_nohotplug_dev_info);
   3309}
   3310
   3311type_init(register_vfio_pci_dev_type)