cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

vfio-helpers.c (26644B)


      1/*
      2 * VFIO utility
      3 *
      4 * Copyright 2016 - 2018 Red Hat, Inc.
      5 *
      6 * Authors:
      7 *   Fam Zheng <famz@redhat.com>
      8 *
      9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
     10 * See the COPYING file in the top-level directory.
     11 */
     12
     13#include "qemu/osdep.h"
     14#include <sys/ioctl.h>
     15#include <linux/vfio.h>
     16#include "qapi/error.h"
     17#include "exec/ramlist.h"
     18#include "exec/cpu-common.h"
     19#include "exec/memory.h"
     20#include "trace.h"
     21#include "qemu/error-report.h"
     22#include "standard-headers/linux/pci_regs.h"
     23#include "qemu/event_notifier.h"
     24#include "qemu/vfio-helpers.h"
     25#include "qemu/lockable.h"
     26#include "trace.h"
     27
     28#define QEMU_VFIO_DEBUG 0
     29
     30#define QEMU_VFIO_IOVA_MIN 0x10000ULL
     31/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
     32 * we can use a runtime limit; alternatively it's also possible to do platform
     33 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
     34 **/
     35#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
     36
     37typedef struct {
     38    /* Page aligned addr. */
     39    void *host;
     40    size_t size;
     41    uint64_t iova;
     42} IOVAMapping;
     43
     44struct IOVARange {
     45    uint64_t start;
     46    uint64_t end;
     47};
     48
     49struct QEMUVFIOState {
     50    QemuMutex lock;
     51
     52    /* These fields are protected by BQL */
     53    int container;
     54    int group;
     55    int device;
     56    RAMBlockNotifier ram_notifier;
     57    struct vfio_region_info config_region_info, bar_region_info[6];
     58    struct IOVARange *usable_iova_ranges;
     59    uint8_t nb_iova_ranges;
     60
     61    /* These fields are protected by @lock */
     62    /* VFIO's IO virtual address space is managed by splitting it into a few
     63     * sections:
     64     *
     65     * ---------------       <= 0
     66     * |xxxxxxxxxxxxx|
     67     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     68     * |             |
     69     * |    Fixed    |
     70     * |             |
     71     * |-------------|       <= low_water_mark
     72     * |             |
     73     * |    Free     |
     74     * |             |
     75     * |-------------|       <= high_water_mark
     76     * |             |
     77     * |    Temp     |
     78     * |             |
     79     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     80     * |xxxxxxxxxxxxx|
     81     * |xxxxxxxxxxxxx|
     82     * ---------------
     83     *
     84     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     85     *
     86     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     87     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
     88     *   reclaimed - low_water_mark never shrinks;
     89     *
     90     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     91     *
     92     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     93     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
     94     * is recycled. The caller should make sure I/Os depending on these
     95     *   mappings are completed before calling.
     96     **/
     97    uint64_t low_water_mark;
     98    uint64_t high_water_mark;
     99    IOVAMapping *mappings;
    100    int nr_mappings;
    101};
    102
    103/**
     104 * Find the group file for the PCI device address given in @device, and return
     105 * the path. The returned string is owned by the caller and should be g_free'd later.
    106 */
    107static char *sysfs_find_group_file(const char *device, Error **errp)
    108{
    109    char *sysfs_link;
    110    char *sysfs_group;
    111    char *p;
    112    char *path = NULL;
    113
    114    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
    115    sysfs_group = g_malloc0(PATH_MAX);
    116    if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
    117        error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
    118        goto out;
    119    }
    120    p = strrchr(sysfs_group, '/');
    121    if (!p) {
    122        error_setg(errp, "Failed to find iommu group number");
    123        goto out;
    124    }
    125
    126    path = g_strdup_printf("/dev/vfio/%s", p + 1);
    127out:
    128    g_free(sysfs_link);
    129    g_free(sysfs_group);
    130    return path;
    131}
    132
    133static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
    134{
    135    assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
    136}
    137
    138static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
    139{
    140    g_autofree char *barname = NULL;
    141    assert_bar_index_valid(s, index);
    142    s->bar_region_info[index] = (struct vfio_region_info) {
    143        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
    144        .argsz = sizeof(struct vfio_region_info),
    145    };
    146    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
    147        error_setg_errno(errp, errno, "Failed to get BAR region info");
    148        return -errno;
    149    }
    150    barname = g_strdup_printf("bar[%d]", index);
    151    trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
    152                                s->bar_region_info[index].size,
    153                                s->bar_region_info[index].cap_offset);
    154
    155    return 0;
    156}
    157
    158/**
    159 * Map a PCI bar area.
    160 */
    161void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
    162                            uint64_t offset, uint64_t size, int prot,
    163                            Error **errp)
    164{
    165    void *p;
    166    assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size));
    167    assert_bar_index_valid(s, index);
    168    p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
    169             prot, MAP_SHARED,
    170             s->device, s->bar_region_info[index].offset + offset);
     171    trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset,
    172                                size, offset, p);
    173    if (p == MAP_FAILED) {
    174        error_setg_errno(errp, errno, "Failed to map BAR region");
    175        p = NULL;
    176    }
    177    return p;
    178}
    179
    180/**
    181 * Unmap a PCI bar area.
    182 */
    183void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
    184                             uint64_t offset, uint64_t size)
    185{
    186    if (bar) {
    187        munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
    188    }
    189}
    190
    191/**
    192 * Initialize device IRQ with @irq_type and register an event notifier.
    193 */
    194int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
    195                           int irq_type, Error **errp)
    196{
    197    int r;
    198    struct vfio_irq_set *irq_set;
    199    size_t irq_set_size;
    200    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
    201
    202    irq_info.index = irq_type;
    203    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
    204        error_setg_errno(errp, errno, "Failed to get device interrupt info");
    205        return -errno;
    206    }
    207    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
    208        error_setg(errp, "Device interrupt doesn't support eventfd");
    209        return -EINVAL;
    210    }
    211
    212    irq_set_size = sizeof(*irq_set) + sizeof(int);
    213    irq_set = g_malloc0(irq_set_size);
    214
    215    /* Get to a known IRQ state */
    216    *irq_set = (struct vfio_irq_set) {
    217        .argsz = irq_set_size,
    218        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
    219        .index = irq_info.index,
    220        .start = 0,
    221        .count = 1,
    222    };
    223
    224    *(int *)&irq_set->data = event_notifier_get_fd(e);
    225    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
    226    g_free(irq_set);
    227    if (r) {
    228        error_setg_errno(errp, errno, "Failed to setup device interrupt");
    229        return -errno;
    230    }
    231    return 0;
    232}
    233
    234static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
    235                                     int size, int ofs)
    236{
    237    int ret;
    238
    239    trace_qemu_vfio_pci_read_config(buf, ofs, size,
    240                                    s->config_region_info.offset,
    241                                    s->config_region_info.size);
    242    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
    243    do {
    244        ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
    245    } while (ret == -1 && errno == EINTR);
    246    return ret == size ? 0 : -errno;
    247}
    248
    249static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
    250{
    251    int ret;
    252
    253    trace_qemu_vfio_pci_write_config(buf, ofs, size,
    254                                     s->config_region_info.offset,
    255                                     s->config_region_info.size);
    256    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
    257    do {
    258        ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
    259    } while (ret == -1 && errno == EINTR);
    260    return ret == size ? 0 : -errno;
    261}
    262
    263static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
    264{
    265    struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
    266    struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
    267    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
    268    int i;
    269
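            /* Walk the capability chain until the IOVA-range capability is found. */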
    270    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
    271        if (!cap->next) {
    272            return;
    273        }
    274        cap = (struct vfio_info_cap_header *)(buf + cap->next);
    275    }
    276
    277    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
    278
    279    s->nb_iova_ranges = cap_iova_range->nr_iovas;
    280    if (s->nb_iova_ranges > 1) {
    281        s->usable_iova_ranges =
    282            g_realloc(s->usable_iova_ranges,
    283                      s->nb_iova_ranges * sizeof(struct IOVARange));
    284    }
    285
    286    for (i = 0; i < s->nb_iova_ranges; i++) {
    287        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
    288        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
    289    }
    290}
    291
    292static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
    293                              Error **errp)
    294{
    295    int ret;
    296    int i;
    297    uint16_t pci_cmd;
    298    struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
    299    struct vfio_iommu_type1_info *iommu_info = NULL;
    300    size_t iommu_info_size = sizeof(*iommu_info);
    301    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    302    char *group_file = NULL;
    303
    304    s->usable_iova_ranges = NULL;
    305
    306    /* Create a new container */
    307    s->container = open("/dev/vfio/vfio", O_RDWR);
    308
    309    if (s->container == -1) {
    310        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
    311        return -errno;
    312    }
    313    if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
    314        error_setg(errp, "Invalid VFIO version");
    315        ret = -EINVAL;
    316        goto fail_container;
    317    }
    318
    319    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
    320        error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
    321        ret = -EINVAL;
    322        goto fail_container;
    323    }
    324
    325    /* Open the group */
    326    group_file = sysfs_find_group_file(device, errp);
    327    if (!group_file) {
    328        ret = -EINVAL;
    329        goto fail_container;
    330    }
    331
    332    s->group = open(group_file, O_RDWR);
    333    if (s->group == -1) {
    334        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
    335                         group_file);
    336        g_free(group_file);
    337        ret = -errno;
    338        goto fail_container;
    339    }
    340    g_free(group_file);
    341
    342    /* Test the group is viable and available */
    343    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
    344        error_setg_errno(errp, errno, "Failed to get VFIO group status");
    345        ret = -errno;
    346        goto fail;
    347    }
    348
    349    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
    350        error_setg(errp, "VFIO group is not viable");
    351        ret = -EINVAL;
    352        goto fail;
    353    }
    354
    355    /* Add the group to the container */
    356    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
    357        error_setg_errno(errp, errno, "Failed to add group to VFIO container");
    358        ret = -errno;
    359        goto fail;
    360    }
    361
    362    /* Enable the IOMMU model we want */
    363    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
    364        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
    365        ret = -errno;
    366        goto fail;
    367    }
    368
    369    iommu_info = g_malloc0(iommu_info_size);
    370    iommu_info->argsz = iommu_info_size;
    371
    372    /* Get additional IOMMU info */
    373    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
    374        error_setg_errno(errp, errno, "Failed to get IOMMU info");
    375        ret = -errno;
    376        goto fail;
    377    }
    378
    379    /*
    380     * if the kernel does not report usable IOVA regions, choose
    381     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
    382     */
    383    s->nb_iova_ranges = 1;
    384    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
    385    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
    386    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
    387
    388    if (iommu_info->argsz > iommu_info_size) {
    389        iommu_info_size = iommu_info->argsz;
    390        iommu_info = g_realloc(iommu_info, iommu_info_size);
    391        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
    392            ret = -errno;
    393            goto fail;
    394        }
    395        collect_usable_iova_ranges(s, iommu_info);
    396    }
    397
    398    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
    399
    400    if (s->device < 0) {
    401        error_setg_errno(errp, errno, "Failed to get device fd");
    402        ret = -errno;
    403        goto fail;
    404    }
    405
    406    /* Test and setup the device */
    407    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
    408        error_setg_errno(errp, errno, "Failed to get device info");
    409        ret = -errno;
    410        goto fail;
    411    }
    412
    413    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
    414        error_setg(errp, "Invalid device regions");
    415        ret = -EINVAL;
    416        goto fail;
    417    }
    418
    419    s->config_region_info = (struct vfio_region_info) {
    420        .index = VFIO_PCI_CONFIG_REGION_INDEX,
    421        .argsz = sizeof(struct vfio_region_info),
    422    };
    423    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
    424        error_setg_errno(errp, errno, "Failed to get config region info");
    425        ret = -errno;
    426        goto fail;
    427    }
    428    trace_qemu_vfio_region_info("config", s->config_region_info.offset,
    429                                s->config_region_info.size,
    430                                s->config_region_info.cap_offset);
    431
    432    for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
    433        ret = qemu_vfio_pci_init_bar(s, i, errp);
    434        if (ret) {
    435            goto fail;
    436        }
    437    }
    438
    439    /* Enable bus master */
    440    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    441    if (ret) {
    442        goto fail;
    443    }
    444    pci_cmd |= PCI_COMMAND_MASTER;
    445    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    446    if (ret) {
    447        goto fail;
    448    }
    449    g_free(iommu_info);
    450    return 0;
    451fail:
    452    g_free(s->usable_iova_ranges);
    453    s->usable_iova_ranges = NULL;
    454    s->nb_iova_ranges = 0;
    455    g_free(iommu_info);
    456    close(s->group);
    457fail_container:
    458    close(s->container);
    459    return ret;
    460}
    461
    462static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host,
    463                                      size_t size, size_t max_size)
    464{
    465    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    466    Error *local_err = NULL;
    467    int ret;
    468
    469    trace_qemu_vfio_ram_block_added(s, host, max_size);
    470    ret = qemu_vfio_dma_map(s, host, max_size, false, NULL, &local_err);
    471    if (ret) {
    472        error_reportf_err(local_err,
    473                          "qemu_vfio_dma_map(%p, %zu) failed: ",
    474                          host, max_size);
    475    }
    476}
    477
    478static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host,
    479                                        size_t size, size_t max_size)
    480{
    481    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    482    if (host) {
    483        trace_qemu_vfio_ram_block_removed(s, host, max_size);
    484        qemu_vfio_dma_unmap(s, host);
    485    }
    486}
    487
    488static void qemu_vfio_open_common(QEMUVFIOState *s)
    489{
    490    qemu_mutex_init(&s->lock);
    491    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
    492    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
    493    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
    494    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    495    ram_block_notifier_add(&s->ram_notifier);
    496}
    497
    498/**
    499 * Open a PCI device, e.g. "0000:00:01.0".
    500 */
    501QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
    502{
    503    int r;
    504    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
    505
    506    /*
     507     * VFIO may pin all memory inside mappings, which results in all memory
     508     * inside RAM blocks being pinned unconditionally.
    509     */
    510    r = ram_block_discard_disable(true);
    511    if (r) {
     512        error_setg_errno(errp, -r, "Cannot disable discarding of RAM");
    513        g_free(s);
    514        return NULL;
    515    }
    516
    517    r = qemu_vfio_init_pci(s, device, errp);
    518    if (r) {
    519        ram_block_discard_disable(false);
    520        g_free(s);
    521        return NULL;
    522    }
    523    qemu_vfio_open_common(s);
    524    return s;
    525}
    526
    527static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
    528{
    529    for (int i = 0; i < s->nr_mappings; ++i) {
    530        trace_qemu_vfio_dump_mapping(s->mappings[i].host,
    531                                     s->mappings[i].iova,
    532                                     s->mappings[i].size);
    533    }
    534}
    535
    536/**
    537 * Find the mapping entry that contains [host, host + size) and set @index to
    538 * the position. If no entry contains it, @index is the position _after_ which
    539 * to insert the new mapping. IOW, it is the index of the largest element that
     540 * is smaller than @host, or -1 if there is no such entry.
    541 */
    542static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
    543                                           int *index)
    544{
    545    IOVAMapping *p = s->mappings;
    546    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
    547    IOVAMapping *mid;
    548    trace_qemu_vfio_find_mapping(s, host);
    549    if (!p) {
    550        *index = -1;
    551        return NULL;
    552    }
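            /* Binary search for the mapping with the largest host address <= @host. */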
    553    while (true) {
    554        mid = p + (q - p) / 2;
    555        if (mid == p) {
    556            break;
    557        }
    558        if (mid->host > host) {
    559            q = mid;
    560        } else if (mid->host < host) {
    561            p = mid;
    562        } else {
    563            break;
    564        }
    565    }
    566    if (mid->host > host) {
    567        mid--;
    568    } else if (mid < &s->mappings[s->nr_mappings - 1]
    569               && (mid + 1)->host <= host) {
    570        mid++;
    571    }
    572    *index = mid - &s->mappings[0];
    573    if (mid >= &s->mappings[0] &&
    574        mid->host <= host && mid->host + mid->size > host) {
    575        assert(mid < &s->mappings[s->nr_mappings]);
    576        return mid;
    577    }
    578    /* At this point *index + 1 is the right position to insert the new
    579     * mapping.*/
    580    return NULL;
    581}
    582
    583/**
    584 * Allocate IOVA and create a new mapping record and insert it in @s.
    585 */
    586static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
    587                                          void *host, size_t size,
    588                                          int index, uint64_t iova)
    589{
    590    int shift;
    591    IOVAMapping m = {.host = host, .size = size, .iova = iova};
    592    IOVAMapping *insert;
    593
    594    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
    595    assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size));
    596    assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size));
    597    trace_qemu_vfio_new_mapping(s, host, size, index, iova);
    598
    599    assert(index >= 0);
    600    s->nr_mappings++;
    601    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
    602    insert = &s->mappings[index];
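            /* Make room at @index by shifting the following entries up one slot. */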
    603    shift = s->nr_mappings - index - 1;
    604    if (shift) {
    605        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
    606    }
    607    *insert = m;
    608    return insert;
    609}
    610
    611/* Do the DMA mapping with VFIO. */
    612static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
    613                                uint64_t iova, Error **errp)
    614{
    615    struct vfio_iommu_type1_dma_map dma_map = {
    616        .argsz = sizeof(dma_map),
    617        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
    618        .iova = iova,
    619        .vaddr = (uintptr_t)host,
    620        .size = size,
    621    };
    622    trace_qemu_vfio_do_mapping(s, host, iova, size);
    623
    624    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
    625        error_setg_errno(errp, errno, "VFIO_MAP_DMA failed");
    626        return -errno;
    627    }
    628    return 0;
    629}
    630
    631/**
    632 * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
    633 */
    634static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
    635                                   Error **errp)
    636{
    637    int index;
    638    struct vfio_iommu_type1_dma_unmap unmap = {
    639        .argsz = sizeof(unmap),
    640        .flags = 0,
    641        .iova = mapping->iova,
    642        .size = mapping->size,
    643    };
    644
    645    index = mapping - s->mappings;
    646    assert(mapping->size > 0);
    647    assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
    648    assert(index >= 0 && index < s->nr_mappings);
    649    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
    650        error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
    651    }
    652    memmove(mapping, &s->mappings[index + 1],
    653            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
    654    s->nr_mappings--;
    655    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
    656}
    657
     658/* Check if the mapping list is sorted in ascending order. */
    659static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
    660{
    661    int i;
    662    if (QEMU_VFIO_DEBUG) {
    663        for (i = 0; i < s->nr_mappings - 1; ++i) {
    664            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
    665                error_report("item %d not sorted!", i);
    666                qemu_vfio_dump_mappings(s);
    667                return false;
    668            }
    669            if (!(s->mappings[i].host + s->mappings[i].size <=
    670                  s->mappings[i + 1].host)) {
    671                error_report("item %d overlap with next!", i);
    672                qemu_vfio_dump_mappings(s);
    673                return false;
    674            }
    675        }
    676    }
    677    return true;
    678}
    679
    680static bool qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size,
    681                                      uint64_t *iova, Error **errp)
    682{
    683    int i;
    684
    685    for (i = 0; i < s->nb_iova_ranges; i++) {
    686        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
    687            continue;
    688        }
    689        s->low_water_mark =
    690            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
    691
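                /* The '== 0' test catches a byte count that wraps to zero (the range
                 * spans the whole remaining 64-bit space) and treats it as big enough. */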
    692        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
    693            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
    694            *iova = s->low_water_mark;
    695            s->low_water_mark += size;
    696            return true;
    697        }
    698    }
    699    error_setg(errp, "fixed iova range not found");
    700
    701    return false;
    702}
    703
    704static bool qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size,
    705                                     uint64_t *iova, Error **errp)
    706{
    707    int i;
    708
    709    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
    710        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
    711            continue;
    712        }
    713        s->high_water_mark =
    714            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
    715
    716        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
    717            s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
    718            *iova = s->high_water_mark - size;
    719            s->high_water_mark = *iova;
    720            return true;
    721        }
    722    }
    723    error_setg(errp, "temporary iova range not found");
    724
    725    return false;
    726}
    727
    728/**
    729 * qemu_vfio_water_mark_reached:
    730 *
     731 * Returns %true if the high watermark has been reached, %false otherwise.
    732 */
    733static bool qemu_vfio_water_mark_reached(QEMUVFIOState *s, size_t size,
    734                                         Error **errp)
    735{
    736    if (s->high_water_mark - s->low_water_mark + 1 < size) {
    737        error_setg(errp, "iova exhausted (water mark reached)");
    738        return true;
    739    }
    740    return false;
    741}
    742
     743/* Map the [host, host + size) area into a contiguous IOVA address space, and store
     744 * the result in @iova if not NULL. The caller needs to make sure the area is
     745 * aligned to page size and does not partially overlap an existing mapping (the
     746 * area must be either entirely unmapped or entirely within one existing mapping).
    747 */
    748int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
    749                      bool temporary, uint64_t *iova, Error **errp)
    750{
    751    int index;
    752    IOVAMapping *mapping;
    753    uint64_t iova0;
    754
    755    assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size));
    756    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
    757    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
    758    QEMU_LOCK_GUARD(&s->lock);
    759    mapping = qemu_vfio_find_mapping(s, host, &index);
    760    if (mapping) {
    761        iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
    762    } else {
    763        int ret;
    764
    765        if (qemu_vfio_water_mark_reached(s, size, errp)) {
    766            return -ENOMEM;
    767        }
    768        if (!temporary) {
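                    /* Permanent mapping: allocate a fixed IOVA from the low region
                     * and record it in the mappings list. */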
    769            if (!qemu_vfio_find_fixed_iova(s, size, &iova0, errp)) {
    770                return -ENOMEM;
    771            }
    772
    773            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
    774            assert(qemu_vfio_verify_mappings(s));
    775            ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
    776            if (ret < 0) {
    777                qemu_vfio_undo_mapping(s, mapping, NULL);
    778                return ret;
    779            }
    780            qemu_vfio_dump_mappings(s);
    781        } else {
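                    /* Temporary mapping: take an IOVA from the high region; it is not
                     * tracked and is recycled wholesale by qemu_vfio_dma_reset_temporary(). */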
    782            if (!qemu_vfio_find_temp_iova(s, size, &iova0, errp)) {
    783                return -ENOMEM;
    784            }
    785            ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
    786            if (ret < 0) {
    787                return ret;
    788            }
    789        }
    790    }
    791    trace_qemu_vfio_dma_mapped(s, host, iova0, size);
    792    if (iova) {
    793        *iova = iova0;
    794    }
    795    return 0;
    796}
    797
    798/* Reset the high watermark and free all "temporary" mappings. */
    799int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
    800{
    801    struct vfio_iommu_type1_dma_unmap unmap = {
    802        .argsz = sizeof(unmap),
    803        .flags = 0,
    804        .iova = s->high_water_mark,
    805        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
    806    };
    807    trace_qemu_vfio_dma_reset_temporary(s);
    808    QEMU_LOCK_GUARD(&s->lock);
    809    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
    810        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
    811        return -errno;
    812    }
    813    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    814    return 0;
    815}
    816
     817/* Unmap the whole area that was previously mapped with
    818 * qemu_vfio_dma_map(). */
    819void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
    820{
    821    int index = 0;
    822    IOVAMapping *m;
    823
    824    if (!host) {
    825        return;
    826    }
    827
    828    trace_qemu_vfio_dma_unmap(s, host);
    829    QEMU_LOCK_GUARD(&s->lock);
    830    m = qemu_vfio_find_mapping(s, host, &index);
    831    if (!m) {
    832        return;
    833    }
    834    qemu_vfio_undo_mapping(s, m, NULL);
    835}
    836
    837static void qemu_vfio_reset(QEMUVFIOState *s)
    838{
    839    ioctl(s->device, VFIO_DEVICE_RESET);
    840}
    841
    842/* Close and free the VFIO resources. */
    843void qemu_vfio_close(QEMUVFIOState *s)
    844{
    845    int i;
    846
    847    if (!s) {
    848        return;
    849    }
    850    for (i = 0; i < s->nr_mappings; ++i) {
    851        qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
    852    }
    853    ram_block_notifier_remove(&s->ram_notifier);
    854    g_free(s->usable_iova_ranges);
    855    s->nb_iova_ranges = 0;
    856    qemu_vfio_reset(s);
    857    close(s->device);
    858    close(s->group);
    859    close(s->container);
    860    ram_block_discard_disable(false);
    861}
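

Illustration (not part of vfio-helpers.c): a minimal, hypothetical sketch of how a caller might drive the helpers above; in QEMU the main in-tree consumer of this API is the userspace NVMe driver (block/nvme.c). The device address, BAR size and buffer size are made-up example values, PROT_* comes from <sys/mman.h>, and qemu_memalign()/qemu_vfree() are assumed from QEMU's oslib helpers.

/* Hypothetical usage sketch -- for illustration only. */
static int vfio_helpers_usage_sketch(Error **errp)
{
    QEMUVFIOState *s;
    void *regs, *buf;
    uint64_t iova;

    /* Open the PCI device (example address); sets up the VFIO container/group/device. */
    s = qemu_vfio_open_pci("0000:00:01.0", errp);
    if (!s) {
        return -1;
    }

    /* Map the first page of BAR 0 for MMIO register access. */
    regs = qemu_vfio_pci_map_bar(s, 0, 0, qemu_real_host_page_size,
                                 PROT_READ | PROT_WRITE, errp);
    if (!regs) {
        qemu_vfio_close(s);
        return -1;
    }

    /* Create a permanent DMA mapping for one page-aligned host buffer. */
    buf = qemu_memalign(qemu_real_host_page_size, qemu_real_host_page_size);
    if (qemu_vfio_dma_map(s, buf, qemu_real_host_page_size, false, &iova, errp)) {
        qemu_vfree(buf);
        qemu_vfio_pci_unmap_bar(s, 0, regs, 0, qemu_real_host_page_size);
        qemu_vfio_close(s);
        return -1;
    }

    /* ... program the device with @iova and access its registers through @regs ... */

    qemu_vfio_dma_unmap(s, buf);
    qemu_vfree(buf);
    qemu_vfio_pci_unmap_bar(s, 0, regs, 0, qemu_real_host_page_size);
    qemu_vfio_close(s);
    return 0;
}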