cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

vdi.c (35440B)


      1/*
      2 * Block driver for the Virtual Disk Image (VDI) format
      3 *
      4 * Copyright (c) 2009, 2012 Stefan Weil
      5 *
      6 * This program is free software: you can redistribute it and/or modify
      7 * it under the terms of the GNU General Public License as published by
      8 * the Free Software Foundation, either version 2 of the License, or
      9 * (at your option) version 3 or any later version.
     10 *
     11 * This program is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 * GNU General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU General Public License
     17 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
     18 *
     19 * Reference:
     20 * http://forums.virtualbox.org/viewtopic.php?t=8046
     21 *
     22 * This driver supports create / read / write operations on VDI images.
     23 *
     24 * Todo (see also TODO in code):
     25 *
     26 * Some features like snapshots are still missing.
     27 *
     28 * Deallocation of zero-filled blocks and shrinking images are missing, too
     29 * (might be added to common block layer).
     30 *
     31 * Allocation of blocks could be optimized (less writes to block map and
     32 * header).
     33 *
     34 * Read and write of adjacent blocks could be done in one operation
     35 * (current code uses one operation per block (1 MiB)).
     36 *
     37 * The code is not thread safe (missing locks for changes in header and
     38 * block table, no problem with current QEMU).
     39 *
     40 * Hints:
     41 *
     42 * Blocks (VDI documentation) correspond to clusters (QEMU).
     43 * QEMU's backing files could be implemented using VDI snapshot files (TODO).
     44 * VDI snapshot files may also contain the complete machine state.
     45 * Maybe this machine state can be converted to QEMU PC machine snapshot data.
     46 *
     47 * The driver keeps a block cache (little endian entries) in memory.
     48 * For the standard block size (1 MiB), a 1 TiB disk will use 4 MiB RAM,
     49 * so this seems to be reasonable.
     50 */
     51
     52#include "qemu/osdep.h"
     53#include "qemu/units.h"
     54#include "qapi/error.h"
     55#include "qapi/qobject-input-visitor.h"
     56#include "qapi/qapi-visit-block-core.h"
     57#include "block/block_int.h"
     58#include "block/qdict.h"
     59#include "sysemu/block-backend.h"
     60#include "qemu/module.h"
     61#include "qemu/option.h"
     62#include "qemu/bswap.h"
     63#include "migration/blocker.h"
     64#include "qemu/coroutine.h"
     65#include "qemu/cutils.h"
     66#include "qemu/uuid.h"
     67
     68/* Code configuration options. */
     69
     70/* Enable debug messages. */
     71//~ #define CONFIG_VDI_DEBUG
     72
     73/* Support write operations on VDI images. */
     74#define CONFIG_VDI_WRITE
     75
     76/* Support non-standard block (cluster) size. This is untested.
     77 * Maybe it will be needed for very large images.
     78 */
     79//~ #define CONFIG_VDI_BLOCK_SIZE
     80
     81/* Support static (fixed, pre-allocated) images. */
     82#define CONFIG_VDI_STATIC_IMAGE
     83
     84/* Command line option for static images. */
     85#define BLOCK_OPT_STATIC "static"
     86
     87#define SECTOR_SIZE 512
     88#define DEFAULT_CLUSTER_SIZE 1048576
     89/* Note: can't use 1 * MiB, because it's passed to stringify() */
     90
/*
 * VDI_DEBUG is always defined (as 1 or 0) so that it can be tested with a
 * plain C `if` below rather than with the preprocessor.
 */
#if defined(CONFIG_VDI_DEBUG)
#define VDI_DEBUG 1
#else
#define VDI_DEBUG 0
#endif

/*
 * Debug trace helper: prints "vdi", the calling function's name and the
 * formatted message to stderr when VDI_DEBUG is non-zero.  Because the
 * fprintf() call sits behind a C `if`, it is still compiled when debugging
 * is disabled.
 */
#define logout(fmt, ...) \
    do {                                                                \
        if (VDI_DEBUG) {                                                \
            fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__); \
        }                                                               \
    } while (0)
    103
    104/* Image signature. */
    105#define VDI_SIGNATURE 0xbeda107f
    106
    107/* Image version. */
    108#define VDI_VERSION_1_1 0x00010001
    109
    110/* Image type. */
    111#define VDI_TYPE_DYNAMIC 1
    112#define VDI_TYPE_STATIC  2
    113
    114/* Innotek / SUN images use these strings in header.text:
    115 * "<<< innotek VirtualBox Disk Image >>>\n"
    116 * "<<< Sun xVM VirtualBox Disk Image >>>\n"
    117 * "<<< Sun VirtualBox Disk Image >>>\n"
    118 * The value does not matter, so QEMU created images use a different text.
    119 */
    120#define VDI_TEXT "<<< QEMU VM Virtual Disk Image >>>\n"
    121
    122/* A never-allocated block; semantically arbitrary content. */
    123#define VDI_UNALLOCATED 0xffffffffU
    124
    125/* A discarded (no longer allocated) block; semantically zero-filled. */
    126#define VDI_DISCARDED   0xfffffffeU
    127
    128#define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)
    129
    130/* The bmap will take up VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) bytes; since
    131 * the bmap is read and written in a single operation, its size needs to be
    132 * limited to INT_MAX; furthermore, when opening an image, the bmap size is
    133 * rounded up to be aligned on BDRV_SECTOR_SIZE.
    134 * Therefore this should satisfy the following:
    135 * VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) + BDRV_SECTOR_SIZE == INT_MAX + 1
    136 * (INT_MAX + 1 is the first value not representable as an int)
    137 * This guarantees that any value below or equal to the constant will, when
    138 * multiplied by sizeof(uint32_t) and rounded up to a BDRV_SECTOR_SIZE boundary,
    139 * still be below or equal to INT_MAX. */
    140#define VDI_BLOCKS_IN_IMAGE_MAX \
    141    ((unsigned)((INT_MAX + 1u - BDRV_SECTOR_SIZE) / sizeof(uint32_t)))
    142#define VDI_DISK_SIZE_MAX        ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \
    143                                  (uint64_t)DEFAULT_CLUSTER_SIZE)
    144
    145static QemuOptsList vdi_create_opts;
    146
/*
 * VDI image header.  The struct is packed and asserted below to be exactly
 * 512 bytes; vdi_open() reads it straight from offset 0 of the image file.
 * Multi-byte fields are converted with vdi_header_to_cpu() and
 * vdi_header_to_le().
 */
typedef struct {
    char text[0x40];            /* free-form text, see VDI_TEXT */
    uint32_t signature;         /* compared against VDI_SIGNATURE */
    uint32_t version;           /* compared against VDI_VERSION_1_1 */
    uint32_t header_size;
    uint32_t image_type;        /* VDI_TYPE_DYNAMIC or VDI_TYPE_STATIC */
    uint32_t image_flags;
    char description[256];
    uint32_t offset_bmap;       /* byte offset of the block map */
    uint32_t offset_data;       /* byte offset of the first data block */
    uint32_t cylinders;         /* disk geometry, unused here */
    uint32_t heads;             /* disk geometry, unused here */
    uint32_t sectors;           /* disk geometry, unused here */
    uint32_t sector_size;       /* vdi_open() requires SECTOR_SIZE */
    uint32_t unused1;
    uint64_t disk_size;         /* virtual disk size in bytes */
    uint32_t block_size;        /* vdi_open() requires DEFAULT_CLUSTER_SIZE */
    uint32_t block_extra;       /* unused here */
    uint32_t blocks_in_image;   /* number of block map entries */
    uint32_t blocks_allocated;  /* also the index given to the next new block */
    QemuUUID uuid_image;
    QemuUUID uuid_last_snap;
    QemuUUID uuid_link;         /* vdi_open() requires a null UUID */
    QemuUUID uuid_parent;       /* vdi_open() requires a null UUID */
    uint64_t unused2[7];
} QEMU_PACKED VdiHeader;

/* The header must occupy exactly 512 bytes. */
QEMU_BUILD_BUG_ON(sizeof(VdiHeader) != 512);
    175
/* Per-image driver state, stored in BlockDriverState.opaque. */
typedef struct {
    /* The block map entries are little endian (even in memory). */
    uint32_t *bmap;
    /* Size of block (bytes). */
    uint32_t block_size;
    /* First sector of block map. */
    uint32_t bmap_sector;
    /* VDI header (converted to host endianness). */
    VdiHeader header;

    /*
     * Taken for reading around block map lookups and upgraded to a write
     * lock while a new block is being allocated.
     */
    CoRwlock bmap_lock;

    /* Migration blocker registered by vdi_open(). */
    Error *migration_blocker;
} BDRVVdiState;
    190
    191static void vdi_header_to_cpu(VdiHeader *header)
    192{
    193    header->signature = le32_to_cpu(header->signature);
    194    header->version = le32_to_cpu(header->version);
    195    header->header_size = le32_to_cpu(header->header_size);
    196    header->image_type = le32_to_cpu(header->image_type);
    197    header->image_flags = le32_to_cpu(header->image_flags);
    198    header->offset_bmap = le32_to_cpu(header->offset_bmap);
    199    header->offset_data = le32_to_cpu(header->offset_data);
    200    header->cylinders = le32_to_cpu(header->cylinders);
    201    header->heads = le32_to_cpu(header->heads);
    202    header->sectors = le32_to_cpu(header->sectors);
    203    header->sector_size = le32_to_cpu(header->sector_size);
    204    header->disk_size = le64_to_cpu(header->disk_size);
    205    header->block_size = le32_to_cpu(header->block_size);
    206    header->block_extra = le32_to_cpu(header->block_extra);
    207    header->blocks_in_image = le32_to_cpu(header->blocks_in_image);
    208    header->blocks_allocated = le32_to_cpu(header->blocks_allocated);
    209    header->uuid_image = qemu_uuid_bswap(header->uuid_image);
    210    header->uuid_last_snap = qemu_uuid_bswap(header->uuid_last_snap);
    211    header->uuid_link = qemu_uuid_bswap(header->uuid_link);
    212    header->uuid_parent = qemu_uuid_bswap(header->uuid_parent);
    213}
    214
    215static void vdi_header_to_le(VdiHeader *header)
    216{
    217    header->signature = cpu_to_le32(header->signature);
    218    header->version = cpu_to_le32(header->version);
    219    header->header_size = cpu_to_le32(header->header_size);
    220    header->image_type = cpu_to_le32(header->image_type);
    221    header->image_flags = cpu_to_le32(header->image_flags);
    222    header->offset_bmap = cpu_to_le32(header->offset_bmap);
    223    header->offset_data = cpu_to_le32(header->offset_data);
    224    header->cylinders = cpu_to_le32(header->cylinders);
    225    header->heads = cpu_to_le32(header->heads);
    226    header->sectors = cpu_to_le32(header->sectors);
    227    header->sector_size = cpu_to_le32(header->sector_size);
    228    header->disk_size = cpu_to_le64(header->disk_size);
    229    header->block_size = cpu_to_le32(header->block_size);
    230    header->block_extra = cpu_to_le32(header->block_extra);
    231    header->blocks_in_image = cpu_to_le32(header->blocks_in_image);
    232    header->blocks_allocated = cpu_to_le32(header->blocks_allocated);
    233    header->uuid_image = qemu_uuid_bswap(header->uuid_image);
    234    header->uuid_last_snap = qemu_uuid_bswap(header->uuid_last_snap);
    235    header->uuid_link = qemu_uuid_bswap(header->uuid_link);
    236    header->uuid_parent = qemu_uuid_bswap(header->uuid_parent);
    237}
    238
    239static void vdi_header_print(VdiHeader *header)
    240{
    241    char uuidstr[37];
    242    QemuUUID uuid;
    243    logout("text        %s", header->text);
    244    logout("signature   0x%08x\n", header->signature);
    245    logout("header size 0x%04x\n", header->header_size);
    246    logout("image type  0x%04x\n", header->image_type);
    247    logout("image flags 0x%04x\n", header->image_flags);
    248    logout("description %s\n", header->description);
    249    logout("offset bmap 0x%04x\n", header->offset_bmap);
    250    logout("offset data 0x%04x\n", header->offset_data);
    251    logout("cylinders   0x%04x\n", header->cylinders);
    252    logout("heads       0x%04x\n", header->heads);
    253    logout("sectors     0x%04x\n", header->sectors);
    254    logout("sector size 0x%04x\n", header->sector_size);
    255    logout("image size  0x%" PRIx64 " B (%" PRIu64 " MiB)\n",
    256           header->disk_size, header->disk_size / MiB);
    257    logout("block size  0x%04x\n", header->block_size);
    258    logout("block extra 0x%04x\n", header->block_extra);
    259    logout("blocks tot. 0x%04x\n", header->blocks_in_image);
    260    logout("blocks all. 0x%04x\n", header->blocks_allocated);
    261    uuid = header->uuid_image;
    262    qemu_uuid_unparse(&uuid, uuidstr);
    263    logout("uuid image  %s\n", uuidstr);
    264    uuid = header->uuid_last_snap;
    265    qemu_uuid_unparse(&uuid, uuidstr);
    266    logout("uuid snap   %s\n", uuidstr);
    267    uuid = header->uuid_link;
    268    qemu_uuid_unparse(&uuid, uuidstr);
    269    logout("uuid link   %s\n", uuidstr);
    270    uuid = header->uuid_parent;
    271    qemu_uuid_unparse(&uuid, uuidstr);
    272    logout("uuid parent %s\n", uuidstr);
    273}
    274
    275static int coroutine_fn vdi_co_check(BlockDriverState *bs, BdrvCheckResult *res,
    276                                     BdrvCheckMode fix)
    277{
    278    /* TODO: additional checks possible. */
    279    BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
    280    uint32_t blocks_allocated = 0;
    281    uint32_t block;
    282    uint32_t *bmap;
    283    logout("\n");
    284
    285    if (fix) {
    286        return -ENOTSUP;
    287    }
    288
    289    bmap = g_try_new(uint32_t, s->header.blocks_in_image);
    290    if (s->header.blocks_in_image && bmap == NULL) {
    291        res->check_errors++;
    292        return -ENOMEM;
    293    }
    294
    295    memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t));
    296
    297    /* Check block map and value of blocks_allocated. */
    298    for (block = 0; block < s->header.blocks_in_image; block++) {
    299        uint32_t bmap_entry = le32_to_cpu(s->bmap[block]);
    300        if (VDI_IS_ALLOCATED(bmap_entry)) {
    301            if (bmap_entry < s->header.blocks_in_image) {
    302                blocks_allocated++;
    303                if (!VDI_IS_ALLOCATED(bmap[bmap_entry])) {
    304                    bmap[bmap_entry] = bmap_entry;
    305                } else {
    306                    fprintf(stderr, "ERROR: block index %" PRIu32
    307                            " also used by %" PRIu32 "\n", bmap[bmap_entry], bmap_entry);
    308                    res->corruptions++;
    309                }
    310            } else {
    311                fprintf(stderr, "ERROR: block index %" PRIu32
    312                        " too large, is %" PRIu32 "\n", block, bmap_entry);
    313                res->corruptions++;
    314            }
    315        }
    316    }
    317    if (blocks_allocated != s->header.blocks_allocated) {
    318        fprintf(stderr, "ERROR: allocated blocks mismatch, is %" PRIu32
    319               ", should be %" PRIu32 "\n",
    320               blocks_allocated, s->header.blocks_allocated);
    321        res->corruptions++;
    322    }
    323
    324    g_free(bmap);
    325
    326    return 0;
    327}
    328
    329static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    330{
    331    /* TODO: vdi_get_info would be needed for machine snapshots.
    332       vm_state_offset is still missing. */
    333    BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
    334    logout("\n");
    335    bdi->cluster_size = s->block_size;
    336    bdi->vm_state_offset = 0;
    337    return 0;
    338}
    339
    340static int vdi_make_empty(BlockDriverState *bs)
    341{
    342    /* TODO: missing code. */
    343    logout("\n");
    344    /* The return value for missing code must be 0, see block.c. */
    345    return 0;
    346}
    347
    348static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename)
    349{
    350    const VdiHeader *header = (const VdiHeader *)buf;
    351    int ret = 0;
    352
    353    logout("\n");
    354
    355    if (buf_size < sizeof(*header)) {
    356        /* Header too small, no VDI. */
    357    } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) {
    358        ret = 100;
    359    }
    360
    361    if (ret == 0) {
    362        logout("no vdi image\n");
    363    } else {
    364        logout("%s", header->text);
    365    }
    366
    367    return ret;
    368}
    369
    370static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
    371                    Error **errp)
    372{
    373    BDRVVdiState *s = bs->opaque;
    374    VdiHeader header;
    375    size_t bmap_size;
    376    int ret;
    377    QemuUUID uuid_link, uuid_parent;
    378
    379    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
    380                               BDRV_CHILD_IMAGE, false, errp);
    381    if (!bs->file) {
    382        return -EINVAL;
    383    }
    384
    385    logout("\n");
    386
    387    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    388    if (ret < 0) {
    389        goto fail;
    390    }
    391
    392    vdi_header_to_cpu(&header);
    393    if (VDI_DEBUG) {
    394        vdi_header_print(&header);
    395    }
    396
    397    if (header.disk_size > VDI_DISK_SIZE_MAX) {
    398        error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64
    399                          ", max supported is 0x%" PRIx64 ")",
    400                          header.disk_size, VDI_DISK_SIZE_MAX);
    401        ret = -ENOTSUP;
    402        goto fail;
    403    }
    404
    405    uuid_link = header.uuid_link;
    406    uuid_parent = header.uuid_parent;
    407
    408    if (header.disk_size % SECTOR_SIZE != 0) {
    409        /* 'VBoxManage convertfromraw' can create images with odd disk sizes.
    410           We accept them but round the disk size to the next multiple of
    411           SECTOR_SIZE. */
    412        logout("odd disk size %" PRIu64 " B, round up\n", header.disk_size);
    413        header.disk_size = ROUND_UP(header.disk_size, SECTOR_SIZE);
    414    }
    415
    416    if (header.signature != VDI_SIGNATURE) {
    417        error_setg(errp, "Image not in VDI format (bad signature %08" PRIx32
    418                   ")", header.signature);
    419        ret = -EINVAL;
    420        goto fail;
    421    } else if (header.version != VDI_VERSION_1_1) {
    422        error_setg(errp, "unsupported VDI image (version %" PRIu32 ".%" PRIu32
    423                   ")", header.version >> 16, header.version & 0xffff);
    424        ret = -ENOTSUP;
    425        goto fail;
    426    } else if (header.offset_bmap % SECTOR_SIZE != 0) {
    427        /* We only support block maps which start on a sector boundary. */
    428        error_setg(errp, "unsupported VDI image (unaligned block map offset "
    429                   "0x%" PRIx32 ")", header.offset_bmap);
    430        ret = -ENOTSUP;
    431        goto fail;
    432    } else if (header.offset_data % SECTOR_SIZE != 0) {
    433        /* We only support data blocks which start on a sector boundary. */
    434        error_setg(errp, "unsupported VDI image (unaligned data offset 0x%"
    435                   PRIx32 ")", header.offset_data);
    436        ret = -ENOTSUP;
    437        goto fail;
    438    } else if (header.sector_size != SECTOR_SIZE) {
    439        error_setg(errp, "unsupported VDI image (sector size %" PRIu32
    440                   " is not %u)", header.sector_size, SECTOR_SIZE);
    441        ret = -ENOTSUP;
    442        goto fail;
    443    } else if (header.block_size != DEFAULT_CLUSTER_SIZE) {
    444        error_setg(errp, "unsupported VDI image (block size %" PRIu32
    445                         " is not %" PRIu32 ")",
    446                   header.block_size, DEFAULT_CLUSTER_SIZE);
    447        ret = -ENOTSUP;
    448        goto fail;
    449    } else if (header.disk_size >
    450               (uint64_t)header.blocks_in_image * header.block_size) {
    451        error_setg(errp, "unsupported VDI image (disk size %" PRIu64 ", "
    452                   "image bitmap has room for %" PRIu64 ")",
    453                   header.disk_size,
    454                   (uint64_t)header.blocks_in_image * header.block_size);
    455        ret = -ENOTSUP;
    456        goto fail;
    457    } else if (!qemu_uuid_is_null(&uuid_link)) {
    458        error_setg(errp, "unsupported VDI image (non-NULL link UUID)");
    459        ret = -ENOTSUP;
    460        goto fail;
    461    } else if (!qemu_uuid_is_null(&uuid_parent)) {
    462        error_setg(errp, "unsupported VDI image (non-NULL parent UUID)");
    463        ret = -ENOTSUP;
    464        goto fail;
    465    } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) {
    466        error_setg(errp, "unsupported VDI image "
    467                         "(too many blocks %u, max is %u)",
    468                          header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX);
    469        ret = -ENOTSUP;
    470        goto fail;
    471    }
    472
    473    bs->total_sectors = header.disk_size / SECTOR_SIZE;
    474
    475    s->block_size = header.block_size;
    476    s->bmap_sector = header.offset_bmap / SECTOR_SIZE;
    477    s->header = header;
    478
    479    bmap_size = header.blocks_in_image * sizeof(uint32_t);
    480    bmap_size = DIV_ROUND_UP(bmap_size, SECTOR_SIZE);
    481    s->bmap = qemu_try_blockalign(bs->file->bs, bmap_size * SECTOR_SIZE);
    482    if (s->bmap == NULL) {
    483        ret = -ENOMEM;
    484        goto fail;
    485    }
    486
    487    ret = bdrv_pread(bs->file, header.offset_bmap, s->bmap,
    488                     bmap_size * SECTOR_SIZE);
    489    if (ret < 0) {
    490        goto fail_free_bmap;
    491    }
    492
    493    /* Disable migration when vdi images are used */
    494    error_setg(&s->migration_blocker, "The vdi format used by node '%s' "
    495               "does not support live migration",
    496               bdrv_get_device_or_node_name(bs));
    497    ret = migrate_add_blocker(s->migration_blocker, errp);
    498    if (ret < 0) {
    499        error_free(s->migration_blocker);
    500        goto fail_free_bmap;
    501    }
    502
    503    qemu_co_rwlock_init(&s->bmap_lock);
    504
    505    return 0;
    506
    507 fail_free_bmap:
    508    qemu_vfree(s->bmap);
    509
    510 fail:
    511    return ret;
    512}
    513
    514static int vdi_reopen_prepare(BDRVReopenState *state,
    515                              BlockReopenQueue *queue, Error **errp)
    516{
    517    return 0;
    518}
    519
    520static int coroutine_fn vdi_co_block_status(BlockDriverState *bs,
    521                                            bool want_zero,
    522                                            int64_t offset, int64_t bytes,
    523                                            int64_t *pnum, int64_t *map,
    524                                            BlockDriverState **file)
    525{
    526    BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
    527    size_t bmap_index = offset / s->block_size;
    528    size_t index_in_block = offset % s->block_size;
    529    uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]);
    530    int result;
    531
    532    logout("%p, %" PRId64 ", %" PRId64 ", %p\n", bs, offset, bytes, pnum);
    533    *pnum = MIN(s->block_size - index_in_block, bytes);
    534    result = VDI_IS_ALLOCATED(bmap_entry);
    535    if (!result) {
    536        return BDRV_BLOCK_ZERO;
    537    }
    538
    539    *map = s->header.offset_data + (uint64_t)bmap_entry * s->block_size +
    540        index_in_block;
    541    *file = bs->file->bs;
    542    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID |
    543        (s->header.image_type == VDI_TYPE_STATIC ? BDRV_BLOCK_RECURSE : 0);
    544}
    545
    546static int coroutine_fn
    547vdi_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
    548              QEMUIOVector *qiov, BdrvRequestFlags flags)
    549{
    550    BDRVVdiState *s = bs->opaque;
    551    QEMUIOVector local_qiov;
    552    uint32_t bmap_entry;
    553    uint32_t block_index;
    554    uint32_t offset_in_block;
    555    uint32_t n_bytes;
    556    uint64_t bytes_done = 0;
    557    int ret = 0;
    558
    559    logout("\n");
    560
    561    qemu_iovec_init(&local_qiov, qiov->niov);
    562
    563    while (ret >= 0 && bytes > 0) {
    564        block_index = offset / s->block_size;
    565        offset_in_block = offset % s->block_size;
    566        n_bytes = MIN(bytes, s->block_size - offset_in_block);
    567
    568        logout("will read %u bytes starting at offset %" PRIu64 "\n",
    569               n_bytes, offset);
    570
    571        /* prepare next AIO request */
    572        qemu_co_rwlock_rdlock(&s->bmap_lock);
    573        bmap_entry = le32_to_cpu(s->bmap[block_index]);
    574        qemu_co_rwlock_unlock(&s->bmap_lock);
    575        if (!VDI_IS_ALLOCATED(bmap_entry)) {
    576            /* Block not allocated, return zeros, no need to wait. */
    577            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
    578            ret = 0;
    579        } else {
    580            uint64_t data_offset = s->header.offset_data +
    581                                   (uint64_t)bmap_entry * s->block_size +
    582                                   offset_in_block;
    583
    584            qemu_iovec_reset(&local_qiov);
    585            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
    586
    587            ret = bdrv_co_preadv(bs->file, data_offset, n_bytes,
    588                                 &local_qiov, 0);
    589        }
    590        logout("%u bytes read\n", n_bytes);
    591
    592        bytes -= n_bytes;
    593        offset += n_bytes;
    594        bytes_done += n_bytes;
    595    }
    596
    597    qemu_iovec_destroy(&local_qiov);
    598
    599    return ret;
    600}
    601
    602static int coroutine_fn
    603vdi_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
    604               QEMUIOVector *qiov, BdrvRequestFlags flags)
    605{
    606    BDRVVdiState *s = bs->opaque;
    607    QEMUIOVector local_qiov;
    608    uint32_t bmap_entry;
    609    uint32_t block_index;
    610    uint32_t offset_in_block;
    611    uint32_t n_bytes;
    612    uint64_t data_offset;
    613    uint32_t bmap_first = VDI_UNALLOCATED;
    614    uint32_t bmap_last = VDI_UNALLOCATED;
    615    uint8_t *block = NULL;
    616    uint64_t bytes_done = 0;
    617    int ret = 0;
    618
    619    logout("\n");
    620
    621    qemu_iovec_init(&local_qiov, qiov->niov);
    622
    623    while (ret >= 0 && bytes > 0) {
    624        block_index = offset / s->block_size;
    625        offset_in_block = offset % s->block_size;
    626        n_bytes = MIN(bytes, s->block_size - offset_in_block);
    627
    628        logout("will write %u bytes starting at offset %" PRIu64 "\n",
    629               n_bytes, offset);
    630
    631        /* prepare next AIO request */
    632        qemu_co_rwlock_rdlock(&s->bmap_lock);
    633        bmap_entry = le32_to_cpu(s->bmap[block_index]);
    634        if (!VDI_IS_ALLOCATED(bmap_entry)) {
    635            /* Allocate new block and write to it. */
    636            uint64_t data_offset;
    637            qemu_co_rwlock_upgrade(&s->bmap_lock);
    638            bmap_entry = le32_to_cpu(s->bmap[block_index]);
    639            if (VDI_IS_ALLOCATED(bmap_entry)) {
    640                /* A concurrent allocation did the work for us.  */
    641                qemu_co_rwlock_downgrade(&s->bmap_lock);
    642                goto nonallocating_write;
    643            }
    644
    645            bmap_entry = s->header.blocks_allocated;
    646            s->bmap[block_index] = cpu_to_le32(bmap_entry);
    647            s->header.blocks_allocated++;
    648            data_offset = s->header.offset_data +
    649                          (uint64_t)bmap_entry * s->block_size;
    650            if (block == NULL) {
    651                block = g_malloc(s->block_size);
    652                bmap_first = block_index;
    653            }
    654            bmap_last = block_index;
    655            /* Copy data to be written to new block and zero unused parts. */
    656            memset(block, 0, offset_in_block);
    657            qemu_iovec_to_buf(qiov, bytes_done, block + offset_in_block,
    658                              n_bytes);
    659            memset(block + offset_in_block + n_bytes, 0,
    660                   s->block_size - n_bytes - offset_in_block);
    661
    662            /* Write the new block under CoRwLock write-side protection,
    663             * so this full-cluster write does not overlap a partial write
    664             * of the same cluster, issued from the "else" branch.
    665             */
    666            ret = bdrv_pwrite(bs->file, data_offset, block, s->block_size);
    667            qemu_co_rwlock_unlock(&s->bmap_lock);
    668        } else {
    669nonallocating_write:
    670            data_offset = s->header.offset_data +
    671                           (uint64_t)bmap_entry * s->block_size +
    672                           offset_in_block;
    673            qemu_co_rwlock_unlock(&s->bmap_lock);
    674
    675            qemu_iovec_reset(&local_qiov);
    676            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
    677
    678            ret = bdrv_co_pwritev(bs->file, data_offset, n_bytes,
    679                                  &local_qiov, 0);
    680        }
    681
    682        bytes -= n_bytes;
    683        offset += n_bytes;
    684        bytes_done += n_bytes;
    685
    686        logout("%u bytes written\n", n_bytes);
    687    }
    688
    689    qemu_iovec_destroy(&local_qiov);
    690
    691    logout("finished data write\n");
    692    if (ret < 0) {
    693        g_free(block);
    694        return ret;
    695    }
    696
    697    if (block) {
    698        /* One or more new blocks were allocated. */
    699        VdiHeader *header;
    700        uint8_t *base;
    701        uint64_t offset;
    702        uint32_t n_sectors;
    703
    704        g_free(block);
    705        header = g_malloc(sizeof(*header));
    706
    707        logout("now writing modified header\n");
    708        assert(VDI_IS_ALLOCATED(bmap_first));
    709        *header = s->header;
    710        vdi_header_to_le(header);
    711        ret = bdrv_pwrite(bs->file, 0, header, sizeof(*header));
    712        g_free(header);
    713
    714        if (ret < 0) {
    715            return ret;
    716        }
    717
    718        logout("now writing modified block map entry %u...%u\n",
    719               bmap_first, bmap_last);
    720        /* Write modified sectors from block map. */
    721        bmap_first /= (SECTOR_SIZE / sizeof(uint32_t));
    722        bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
    723        n_sectors = bmap_last - bmap_first + 1;
    724        offset = s->bmap_sector + bmap_first;
    725        base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE;
    726        logout("will write %u block map sectors starting from entry %u\n",
    727               n_sectors, bmap_first);
    728        ret = bdrv_pwrite(bs->file, offset * SECTOR_SIZE, base,
    729                          n_sectors * SECTOR_SIZE);
    730    }
    731
    732    return ret < 0 ? ret : 0;
    733}
    734
    735static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,
    736                                         size_t block_size, Error **errp)
    737{
    738    BlockdevCreateOptionsVdi *vdi_opts;
    739    int ret = 0;
    740    uint64_t bytes = 0;
    741    uint32_t blocks;
    742    uint32_t image_type;
    743    VdiHeader header;
    744    size_t i;
    745    size_t bmap_size;
    746    int64_t offset = 0;
    747    BlockDriverState *bs_file = NULL;
    748    BlockBackend *blk = NULL;
    749    uint32_t *bmap = NULL;
    750    QemuUUID uuid;
    751
    752    assert(create_options->driver == BLOCKDEV_DRIVER_VDI);
    753    vdi_opts = &create_options->u.vdi;
    754
    755    logout("\n");
    756
    757    /* Validate options and set default values */
    758    bytes = vdi_opts->size;
    759
    760    if (!vdi_opts->has_preallocation) {
    761        vdi_opts->preallocation = PREALLOC_MODE_OFF;
    762    }
    763    switch (vdi_opts->preallocation) {
    764    case PREALLOC_MODE_OFF:
    765        image_type = VDI_TYPE_DYNAMIC;
    766        break;
    767    case PREALLOC_MODE_METADATA:
    768        image_type = VDI_TYPE_STATIC;
    769        break;
    770    default:
    771        error_setg(errp, "Preallocation mode not supported for vdi");
    772        return -EINVAL;
    773    }
    774
    775#ifndef CONFIG_VDI_STATIC_IMAGE
    776    if (image_type == VDI_TYPE_STATIC) {
    777        ret = -ENOTSUP;
    778        error_setg(errp, "Statically allocated images cannot be created in "
    779                   "this build");
    780        goto exit;
    781    }
    782#endif
    783#ifndef CONFIG_VDI_BLOCK_SIZE
    784    if (block_size != DEFAULT_CLUSTER_SIZE) {
    785        ret = -ENOTSUP;
    786        error_setg(errp,
    787                   "A non-default cluster size is not supported in this build");
    788        goto exit;
    789    }
    790#endif
    791
    792    if (bytes > VDI_DISK_SIZE_MAX) {
    793        ret = -ENOTSUP;
    794        error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64
    795                          ", max supported is 0x%" PRIx64 ")",
    796                          bytes, VDI_DISK_SIZE_MAX);
    797        goto exit;
    798    }
    799
    800    /* Create BlockBackend to write to the image */
    801    bs_file = bdrv_open_blockdev_ref(vdi_opts->file, errp);
    802    if (!bs_file) {
    803        ret = -EIO;
    804        goto exit;
    805    }
    806
    807    blk = blk_new_with_bs(bs_file, BLK_PERM_WRITE | BLK_PERM_RESIZE,
    808                          BLK_PERM_ALL, errp);
    809    if (!blk) {
    810        ret = -EPERM;
    811        goto exit;
    812    }
    813
    814    blk_set_allow_write_beyond_eof(blk, true);
    815
    816    /* We need enough blocks to store the given disk size,
    817       so always round up. */
    818    blocks = DIV_ROUND_UP(bytes, block_size);
    819
    820    bmap_size = blocks * sizeof(uint32_t);
    821    bmap_size = ROUND_UP(bmap_size, SECTOR_SIZE);
    822
    823    memset(&header, 0, sizeof(header));
    824    pstrcpy(header.text, sizeof(header.text), VDI_TEXT);
    825    header.signature = VDI_SIGNATURE;
    826    header.version = VDI_VERSION_1_1;
    827    header.header_size = 0x180;
    828    header.image_type = image_type;
    829    header.offset_bmap = 0x200;
    830    header.offset_data = 0x200 + bmap_size;
    831    header.sector_size = SECTOR_SIZE;
    832    header.disk_size = bytes;
    833    header.block_size = block_size;
    834    header.blocks_in_image = blocks;
    835    if (image_type == VDI_TYPE_STATIC) {
    836        header.blocks_allocated = blocks;
    837    }
    838    qemu_uuid_generate(&uuid);
    839    header.uuid_image = uuid;
    840    qemu_uuid_generate(&uuid);
    841    header.uuid_last_snap = uuid;
    842    /* There is no need to set header.uuid_link or header.uuid_parent here. */
    843    if (VDI_DEBUG) {
    844        vdi_header_print(&header);
    845    }
    846    vdi_header_to_le(&header);
    847    ret = blk_pwrite(blk, offset, &header, sizeof(header), 0);
    848    if (ret < 0) {
    849        error_setg(errp, "Error writing header");
    850        goto exit;
    851    }
    852    offset += sizeof(header);
    853
    854    if (bmap_size > 0) {
    855        bmap = g_try_malloc0(bmap_size);
    856        if (bmap == NULL) {
    857            ret = -ENOMEM;
    858            error_setg(errp, "Could not allocate bmap");
    859            goto exit;
    860        }
    861        for (i = 0; i < blocks; i++) {
    862            if (image_type == VDI_TYPE_STATIC) {
    863                bmap[i] = i;
    864            } else {
    865                bmap[i] = VDI_UNALLOCATED;
    866            }
    867        }
    868        ret = blk_pwrite(blk, offset, bmap, bmap_size, 0);
    869        if (ret < 0) {
    870            error_setg(errp, "Error writing bmap");
    871            goto exit;
    872        }
    873        offset += bmap_size;
    874    }
    875
    876    if (image_type == VDI_TYPE_STATIC) {
    877        ret = blk_truncate(blk, offset + blocks * block_size, false,
    878                           PREALLOC_MODE_OFF, 0, errp);
    879        if (ret < 0) {
    880            error_prepend(errp, "Failed to statically allocate file");
    881            goto exit;
    882        }
    883    }
    884
    885    ret = 0;
    886exit:
    887    blk_unref(blk);
    888    bdrv_unref(bs_file);
    889    g_free(bmap);
    890    return ret;
    891}
    892
    893static int coroutine_fn vdi_co_create(BlockdevCreateOptions *create_options,
    894                                      Error **errp)
    895{
    896    return vdi_co_do_create(create_options, DEFAULT_CLUSTER_SIZE, errp);
    897}
    898
    899static int coroutine_fn vdi_co_create_opts(BlockDriver *drv,
    900                                           const char *filename,
    901                                           QemuOpts *opts,
    902                                           Error **errp)
    903{
    904    QDict *qdict = NULL;
    905    BlockdevCreateOptions *create_options = NULL;
    906    BlockDriverState *bs_file = NULL;
    907    uint64_t block_size = DEFAULT_CLUSTER_SIZE;
    908    bool is_static = false;
    909    Visitor *v;
    910    int ret;
    911
    912    /* Parse options and convert legacy syntax.
    913     *
    914     * Since CONFIG_VDI_BLOCK_SIZE is disabled by default,
    915     * cluster-size is not part of the QAPI schema; therefore we have
    916     * to parse it before creating the QAPI object. */
    917#if defined(CONFIG_VDI_BLOCK_SIZE)
    918    block_size = qemu_opt_get_size_del(opts,
    919                                       BLOCK_OPT_CLUSTER_SIZE,
    920                                       DEFAULT_CLUSTER_SIZE);
    921    if (block_size < BDRV_SECTOR_SIZE || block_size > UINT32_MAX ||
    922        !is_power_of_2(block_size))
    923    {
    924        error_setg(errp, "Invalid cluster size");
    925        ret = -EINVAL;
    926        goto done;
    927    }
    928#endif
    929    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_STATIC, false)) {
    930        is_static = true;
    931    }
    932
    933    qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vdi_create_opts, true);
    934
    935    /* Create and open the file (protocol layer) */
    936    ret = bdrv_create_file(filename, opts, errp);
    937    if (ret < 0) {
    938        goto done;
    939    }
    940
    941    bs_file = bdrv_open(filename, NULL, NULL,
    942                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
    943    if (!bs_file) {
    944        ret = -EIO;
    945        goto done;
    946    }
    947
    948    qdict_put_str(qdict, "driver", "vdi");
    949    qdict_put_str(qdict, "file", bs_file->node_name);
    950    if (is_static) {
    951        qdict_put_str(qdict, "preallocation", "metadata");
    952    }
    953
    954    /* Get the QAPI object */
    955    v = qobject_input_visitor_new_flat_confused(qdict, errp);
    956    if (!v) {
    957        ret = -EINVAL;
    958        goto done;
    959    }
    960    visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
    961    visit_free(v);
    962    if (!create_options) {
    963        ret = -EINVAL;
    964        goto done;
    965    }
    966
    967    /* Silently round up size */
    968    assert(create_options->driver == BLOCKDEV_DRIVER_VDI);
    969    create_options->u.vdi.size = ROUND_UP(create_options->u.vdi.size,
    970                                          BDRV_SECTOR_SIZE);
    971
    972    /* Create the vdi image (format layer) */
    973    ret = vdi_co_do_create(create_options, block_size, errp);
    974done:
    975    qobject_unref(qdict);
    976    qapi_free_BlockdevCreateOptions(create_options);
    977    bdrv_unref(bs_file);
    978    return ret;
    979}
    980
    981static void vdi_close(BlockDriverState *bs)
    982{
    983    BDRVVdiState *s = bs->opaque;
    984
    985    qemu_vfree(s->bmap);
    986
    987    migrate_del_blocker(s->migration_blocker);
    988    error_free(s->migration_blocker);
    989}
    990
    991static int vdi_has_zero_init(BlockDriverState *bs)
    992{
    993    BDRVVdiState *s = bs->opaque;
    994
    995    if (s->header.image_type == VDI_TYPE_STATIC) {
    996        return bdrv_has_zero_init(bs->file->bs);
    997    } else {
    998        return 1;
    999    }
   1000}
   1001
   1002static QemuOptsList vdi_create_opts = {
   1003    .name = "vdi-create-opts",
   1004    .head = QTAILQ_HEAD_INITIALIZER(vdi_create_opts.head),
   1005    .desc = {
   1006        {
   1007            .name = BLOCK_OPT_SIZE,
   1008            .type = QEMU_OPT_SIZE,
   1009            .help = "Virtual disk size"
   1010        },
   1011#if defined(CONFIG_VDI_BLOCK_SIZE)
   1012        {
   1013            .name = BLOCK_OPT_CLUSTER_SIZE,
   1014            .type = QEMU_OPT_SIZE,
   1015            .help = "VDI cluster (block) size",
   1016            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)
   1017        },
   1018#endif
   1019#if defined(CONFIG_VDI_STATIC_IMAGE)
   1020        {
   1021            .name = BLOCK_OPT_STATIC,
   1022            .type = QEMU_OPT_BOOL,
   1023            .help = "VDI static (pre-allocated) image",
   1024            .def_value_str = "off"
   1025        },
   1026#endif
   1027        /* TODO: An additional option to set UUID values might be useful. */
   1028        { /* end of list */ }
   1029    }
   1030};
   1031
   1032static BlockDriver bdrv_vdi = {
   1033    .format_name = "vdi",
   1034    .instance_size = sizeof(BDRVVdiState),
   1035    .bdrv_probe = vdi_probe,
   1036    .bdrv_open = vdi_open,
   1037    .bdrv_close = vdi_close,
   1038    .bdrv_reopen_prepare = vdi_reopen_prepare,
   1039    .bdrv_child_perm          = bdrv_default_perms,
   1040    .bdrv_co_create      = vdi_co_create,
   1041    .bdrv_co_create_opts = vdi_co_create_opts,
   1042    .bdrv_has_zero_init  = vdi_has_zero_init,
   1043    .bdrv_co_block_status = vdi_co_block_status,
   1044    .bdrv_make_empty = vdi_make_empty,
   1045
   1046    .bdrv_co_preadv     = vdi_co_preadv,
   1047#if defined(CONFIG_VDI_WRITE)
   1048    .bdrv_co_pwritev    = vdi_co_pwritev,
   1049#endif
   1050
   1051    .bdrv_get_info = vdi_get_info,
   1052
   1053    .is_format = true,
   1054    .create_opts = &vdi_create_opts,
   1055    .bdrv_co_check = vdi_co_check,
   1056};
   1057
   1058static void bdrv_vdi_init(void)
   1059{
   1060    logout("\n");
   1061    bdrv_register(&bdrv_vdi);
   1062}
   1063
   1064block_init(bdrv_vdi_init);