cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

vpc.c (37865B)


      1/*
      2 * Block driver for Connectix / Microsoft Virtual PC images
      3 *
      4 * Copyright (c) 2005 Alex Beregszaszi
      5 * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
      6 *
      7 * Permission is hereby granted, free of charge, to any person obtaining a copy
      8 * of this software and associated documentation files (the "Software"), to deal
      9 * in the Software without restriction, including without limitation the rights
     10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     11 * copies of the Software, and to permit persons to whom the Software is
     12 * furnished to do so, subject to the following conditions:
     13 *
     14 * The above copyright notice and this permission notice shall be included in
     15 * all copies or substantial portions of the Software.
     16 *
     17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     23 * THE SOFTWARE.
     24 */
     25
     26#include "qemu/osdep.h"
     27#include "qapi/error.h"
     28#include "block/block_int.h"
     29#include "block/qdict.h"
     30#include "sysemu/block-backend.h"
     31#include "qemu/module.h"
     32#include "qemu/option.h"
     33#include "migration/blocker.h"
     34#include "qemu/bswap.h"
     35#include "qemu/uuid.h"
     36#include "qapi/qmp/qdict.h"
     37#include "qapi/qobject-input-visitor.h"
     38#include "qapi/qapi-visit-block-core.h"
     39
     40/**************************************************************/
     41
     42//#define CACHE
     43
/*
 * Disk type values as stored (big-endian) in the 'type' field of the VHD
 * footer.  The I/O paths in this file compare the footer type against
 * VHD_FIXED to decide whether requests can be passed straight through.
 */
enum vhd_type {
    VHD_FIXED           = 2,
    VHD_DYNAMIC         = 3,
    VHD_DIFFERENCING    = 4,
};
     49
/*
 * Unix time (in seconds) of Jan 1, 2000 0:00:00 (UTC).  VHD timestamps
 * count seconds from that date.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder, head and sector counts of a CHS geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Sector count of the largest geometry expressible with the limits above */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the "force_size" option */
#define VPC_OPT_FORCE_SIZE "force_size"
     61
/*
 * On-disk VHD footer.  All multi-byte fields are big-endian; the structure
 * is packed and must be exactly 512 bytes.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    uint64_t    orig_size;
    /* Image size in bytes; vpc_open() divides it by the sector size */
    uint64_t    current_size;

    /*
     * CHS geometry.  vpc_open() multiplies cyls * heads * secs_per_cyl to
     * obtain the total sector count, so despite its name secs_per_cyl is
     * used as a per-track sector count.
     */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
    /* Pads the structure to 512 bytes */
    uint8_t     reserved[427];
} QEMU_PACKED VHDFooter;
     98
     99QEMU_BUILD_BUG_ON(sizeof(VHDFooter) != 512);
    100
/*
 * On-disk header of a dynamic VHD image, located at the footer's
 * data_offset.  The structure is packed and must be exactly 1024 bytes;
 * vpc_open() converts its multi-byte fields from big-endian.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    /* Number of BAT entries */
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    /* Computed with vpc_checksum() over the header by create_dynamic_disk() */
    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    /* Parent locator entries; not interpreted by the code in this file */
    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
    /* Pads the structure to 1024 bytes */
    uint8_t     reserved2[256];
} QEMU_PACKED VHDDynDiskHeader;
    133
    134QEMU_BUILD_BUG_ON(sizeof(VHDDynDiskHeader) != 1024);
    135
/* Per-image driver state, stored in BlockDriverState.opaque */
typedef struct BDRVVPCState {
    /*
     * Taken by the dynamic-image read, write and block-status paths; the
     * read and write paths drop it around the actual data transfer.
     */
    CoMutex lock;
    /* Footer as read from the image (still big-endian) */
    VHDFooter footer;
    /*
     * Byte offset at which the next data block will be allocated; this is
     * also where rewrite_footer() places the footer.
     */
    uint64_t free_data_block_offset;
    /* Number of entries in pagetable */
    int max_table_entries;
    /*
     * In-memory copy of the BAT in host byte order.  Each entry is the
     * offset of a block's bitmap in 512-byte units, or 0xFFFFFFFF if the
     * block is not allocated.
     */
    uint32_t *pagetable;
    /* Byte offset of the BAT in the image file */
    uint64_t bat_offset;
    /*
     * Offset of the block bitmap most recently filled by
     * get_image_offset(); all ones when no bitmap has been written yet.
     */
    uint64_t last_bitmap_offset;

    /* Size in bytes of one data block (without its bitmap) */
    uint32_t block_size;
    /* Size in bytes of a block's sector bitmap, a multiple of 512 */
    uint32_t bitmap_size;
    /* Set from the "force_size_calc" runtime option ("chs") */
    bool force_use_chs;
    /* Set from the "force_size_calc" runtime option ("current_size") */
    bool force_use_sz;

#ifdef CACHE
    /* Only present when CACHE is defined (the define above is commented out) */
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Migration blocker registered by vpc_open() */
    Error *migration_blocker;
} BDRVVPCState;
    160
/* Name of the runtime option that overrides how the disk size is derived */
#define VPC_OPT_SIZE_CALC "force_size_calc"
/*
 * Runtime options accepted when opening an image.  vpc_open() absorbs them
 * from the options QDict and vpc_parse_options() interprets the value.
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};
    176
    177static QemuOptsList vpc_create_opts;
    178
    179static uint32_t vpc_checksum(void *p, size_t size)
    180{
    181    uint8_t *buf = p;
    182    uint32_t res = 0;
    183    int i;
    184
    185    for (i = 0; i < size; i++)
    186        res += buf[i];
    187
    188    return ~res;
    189}
    190
    191
    192static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
    193{
    194    if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8))
    195        return 100;
    196    return 0;
    197}
    198
    199static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
    200                              Error **errp)
    201{
    202    BDRVVPCState *s = bs->opaque;
    203    const char *size_calc;
    204
    205    size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
    206
    207    if (!size_calc) {
    208       /* no override, use autodetect only */
    209    } else if (!strcmp(size_calc, "current_size")) {
    210        s->force_use_sz = true;
    211    } else if (!strcmp(size_calc, "chs")) {
    212        s->force_use_chs = true;
    213    } else {
    214        error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
    215    }
    216}
    217
/*
 * Opens a VHD image: locates and validates the footer, determines the
 * virtual disk size, and for dynamic images loads the Block Allocation
 * Table into memory.  Finally a migration blocker is registered.
 *
 * @bs:      block driver state; bs->opaque is the BDRVVPCState to fill in
 * @options: open options; the "file" child and the runtime options are
 *           taken from here
 * @flags:   not used by this function
 * @errp:    receives a description of the failure
 *
 * Returns 0 on success and a negative errno value on failure.
 */
static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
{
    BDRVVPCState *s = bs->opaque;
    int i;
    VHDFooter *footer;
    QemuOpts *opts = NULL;
    Error *local_err = NULL;
    bool use_chs;
    VHDDynDiskHeader dyndisk_header;
    uint32_t checksum;
    uint64_t computed_size;
    uint64_t pagetable_size;
    int disk_type = VHD_DYNAMIC;
    int ret;
    int64_t bs_size;

    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
                               BDRV_CHILD_IMAGE, false, errp);
    if (!bs->file) {
        return -EINVAL;
    }

    opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
        ret = -EINVAL;
        goto fail;
    }

    vpc_parse_options(bs, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* First try the start of the file, where create_dynamic_disk() puts a
       copy of the footer */
    ret = bdrv_pread(bs->file, 0, &s->footer, sizeof(s->footer));
    if (ret < 0) {
        error_setg(errp, "Unable to read VHD header");
        goto fail;
    }

    footer = &s->footer;
    if (strncmp(footer->creator, "conectix", 8)) {
        /* No footer at offset 0: fall back to the last 512 bytes of the
           file and treat the image as a fixed disk */
        int64_t offset = bdrv_getlength(bs->file->bs);
        if (offset < 0) {
            ret = offset;
            error_setg(errp, "Invalid file size");
            goto fail;
        } else if (offset < sizeof(*footer)) {
            ret = -EINVAL;
            error_setg(errp, "File too small for a VHD header");
            goto fail;
        }

        /* If a fixed disk, the footer is found only at the end of the file */
        ret = bdrv_pread(bs->file, offset - sizeof(*footer),
                         footer, sizeof(*footer));
        if (ret < 0) {
            /* NOTE(review): no message is stored in errp on this path */
            goto fail;
        }
        if (strncmp(footer->creator, "conectix", 8)) {
            error_setg(errp, "invalid VPC image");
            ret = -EINVAL;
            goto fail;
        }
        disk_type = VHD_FIXED;
    }

    /* The checksum covers the footer with its checksum field zeroed, so
       clear the field for the computation */
    checksum = be32_to_cpu(footer->checksum);
    footer->checksum = 0;
    if (vpc_checksum(footer, sizeof(*footer)) != checksum) {
        error_setg(errp, "Incorrect header checksum");
        ret = -EINVAL;
        goto fail;
    }

    /* Write 'checksum' back to footer, or else will leave it with zero. */
    footer->checksum = cpu_to_be32(checksum);

    /* The visible size of a image in Virtual PC depends on the geometry
       rather than on the size stored in the footer (the size in the footer
       is too large usually) */
    bs->total_sectors = (int64_t)
        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;

    /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
     * VHD image sizes differently.  VPC will rely on CHS geometry,
     * while Hyper-V and disk2vhd use the size specified in the footer.
     *
     * We use a couple of approaches to try and determine the correct method:
     * look at the Creator App field, and look for images that have CHS
     * geometry that is the maximum value.
     *
     * If the CHS geometry is the maximum CHS geometry, then we assume that
     * the size is the footer->current_size to avoid truncation.  Otherwise,
     * we follow the table based on footer->creator_app:
     *
     *  Known creator apps:
     *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
     *      'qemu'  :  CHS              QEMU (uses disk geometry)
     *      'qem2'  :  current_size     QEMU (uses current_size)
     *      'win '  :  current_size     Hyper-V
     *      'd2v '  :  current_size     Disk2vhd
     *      'tap\0' :  current_size     XenServer
     *      'CTXS'  :  current_size     XenConverter
     *
     *  The user can override the table values via drive options, however
     *  even with an override we will still use current_size for images
     *  that have CHS geometry of the maximum size.
     */
    use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
               !!strncmp(footer->creator_app, "qem2", 4) &&
               !!strncmp(footer->creator_app, "d2v ", 4) &&
               !!strncmp(footer->creator_app, "CTXS", 4) &&
               !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;

    if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
        bs->total_sectors = be64_to_cpu(footer->current_size) /
                                        BDRV_SECTOR_SIZE;
    }

    /* Allow a maximum disk size of 2040 GiB */
    if (bs->total_sectors > VHD_MAX_SECTORS) {
        /* NOTE(review): no message is stored in errp on this path */
        ret = -EFBIG;
        goto fail;
    }

    if (disk_type == VHD_DYNAMIC) {
        /* The dynamic disk header lives at the footer's data_offset */
        ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset),
                         &dyndisk_header, sizeof(dyndisk_header));
        if (ret < 0) {
            error_setg(errp, "Error reading dynamic VHD header");
            goto fail;
        }

        if (strncmp(dyndisk_header.magic, "cxsparse", 8)) {
            error_setg(errp, "Invalid header magic");
            ret = -EINVAL;
            goto fail;
        }

        s->block_size = be32_to_cpu(dyndisk_header.block_size);
        if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
            error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
            ret = -EINVAL;
            goto fail;
        }
        /* One bitmap bit per 512-byte sector of the block, rounded up to
           a multiple of 512 bytes */
        s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;

        s->max_table_entries = be32_to_cpu(dyndisk_header.max_table_entries);

        if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
            error_setg(errp, "Too many blocks");
            ret = -EINVAL;
            goto fail;
        }

        /* The BAT must be able to address the whole virtual disk */
        computed_size = (uint64_t) s->max_table_entries * s->block_size;
        if (computed_size < bs->total_sectors * 512) {
            error_setg(errp, "Page table too small");
            ret = -EINVAL;
            goto fail;
        }

        /* Keep the table's byte size (4 bytes per entry) within both
           size_t and int range */
        if (s->max_table_entries > SIZE_MAX / 4 ||
            s->max_table_entries > (int) INT_MAX / 4) {
            error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
                        s->max_table_entries);
            ret = -EINVAL;
            goto fail;
        }

        pagetable_size = (uint64_t) s->max_table_entries * 4;

        s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
        if (s->pagetable == NULL) {
            error_setg(errp, "Unable to allocate memory for page table");
            ret = -ENOMEM;
            goto fail;
        }

        s->bat_offset = be64_to_cpu(dyndisk_header.table_offset);

        ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
                         pagetable_size);
        if (ret < 0) {
            error_setg(errp, "Error reading pagetable");
            goto fail;
        }

        /* Start with the first sector boundary after the BAT, then move
           past the end of every block the BAT references */
        s->free_data_block_offset =
            ROUND_UP(s->bat_offset + pagetable_size, 512);

        for (i = 0; i < s->max_table_entries; i++) {
            /* Convert the entry to host byte order in place */
            be32_to_cpus(&s->pagetable[i]);
            if (s->pagetable[i] != 0xFFFFFFFF) {
                int64_t next = (512 * (int64_t) s->pagetable[i]) +
                    s->bitmap_size + s->block_size;

                if (next > s->free_data_block_offset) {
                    s->free_data_block_offset = next;
                }
            }
        }

        bs_size = bdrv_getlength(bs->file->bs);
        if (bs_size < 0) {
            error_setg_errno(errp, -bs_size, "Unable to learn image size");
            ret = bs_size;
            goto fail;
        }
        if (s->free_data_block_offset > bs_size) {
            error_setg(errp, "block-vpc: free_data_block_offset points after "
                             "the end of file. The image has been truncated.");
            ret = -EINVAL;
            goto fail;
        }

        /* No block bitmap has been written yet */
        s->last_bitmap_offset = (int64_t) -1;

#ifdef CACHE
        /* NOTE(review): this disabled code assigns to s->last_pagetable,
           which is not a member of BDRVVPCState as declared in this file */
        s->pageentry_u8 = g_malloc(512);
        s->pageentry_u32 = s->pageentry_u8;
        s->pageentry_u16 = s->pageentry_u8;
        s->last_pagetable = -1;
#endif
    }

    /* Disable migration when VHD images are used */
    error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
    ret = migrate_add_blocker(s->migration_blocker, errp);
    if (ret < 0) {
        error_free(s->migration_blocker);
        goto fail;
    }

    qemu_co_mutex_init(&s->lock);
    qemu_opts_del(opts);

    return 0;

fail:
    /* Common cleanup: release the options and whatever was allocated */
    qemu_opts_del(opts);
    qemu_vfree(s->pagetable);
#ifdef CACHE
    g_free(s->pageentry_u8);
#endif
    return ret;
}
    470
/*
 * Reopen preparation hook.  It inspects none of its arguments and always
 * reports success (0).
 */
static int vpc_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
{
    return 0;
}
    476
    477/*
    478 * Returns the absolute byte offset of the given sector in the image file.
    479 * If the sector is not allocated, -1 is returned instead.
    480 * If an error occurred trying to write an updated block bitmap back to
    481 * the file, -2 is returned, and the error value is written to *err.
    482 * This can only happen for a write operation.
    483 *
    484 * The parameter write must be 1 if the offset will be used for a write
    485 * operation (the block bitmaps is updated then), 0 otherwise.
    486 * If write is true then err must not be NULL.
    487 */
    488static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
    489                                       bool write, int *err)
    490{
    491    BDRVVPCState *s = bs->opaque;
    492    uint64_t bitmap_offset, block_offset;
    493    uint32_t pagetable_index, offset_in_block;
    494
    495    assert(!(write && err == NULL));
    496
    497    pagetable_index = offset / s->block_size;
    498    offset_in_block = offset % s->block_size;
    499
    500    if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
    501        return -1; /* not allocated */
    502
    503    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
    504    block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
    505
    506    /* We must ensure that we don't write to any sectors which are marked as
    507       unused in the bitmap. We get away with setting all bits in the block
    508       bitmap each time we write to a new block. This might cause Virtual PC to
    509       miss sparse read optimization, but it's not a problem in terms of
    510       correctness. */
    511    if (write && (s->last_bitmap_offset != bitmap_offset)) {
    512        uint8_t bitmap[s->bitmap_size];
    513        int r;
    514
    515        s->last_bitmap_offset = bitmap_offset;
    516        memset(bitmap, 0xff, s->bitmap_size);
    517        r = bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
    518        if (r < 0) {
    519            *err = r;
    520            return -2;
    521        }
    522    }
    523
    524    return block_offset;
    525}
    526
    527/*
    528 * Writes the footer to the end of the image file. This is needed when the
    529 * file grows as it overwrites the old footer
    530 *
    531 * Returns 0 on success and < 0 on error
    532 */
    533static int rewrite_footer(BlockDriverState *bs)
    534{
    535    int ret;
    536    BDRVVPCState *s = bs->opaque;
    537    int64_t offset = s->free_data_block_offset;
    538
    539    ret = bdrv_pwrite_sync(bs->file, offset, &s->footer, sizeof(s->footer));
    540    if (ret < 0)
    541        return ret;
    542
    543    return 0;
    544}
    545
    546/*
    547 * Allocates a new block. This involves writing a new footer and updating
    548 * the Block Allocation Table to use the space at the old end of the image
    549 * file (overwriting the old footer)
    550 *
    551 * Returns the sectors' offset in the image file on success and < 0 on error
    552 */
    553static int64_t alloc_block(BlockDriverState *bs, int64_t offset)
    554{
    555    BDRVVPCState *s = bs->opaque;
    556    int64_t bat_offset;
    557    uint32_t index, bat_value;
    558    int ret;
    559    uint8_t bitmap[s->bitmap_size];
    560
    561    /* Check if sector_num is valid */
    562    if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
    563        return -EINVAL;
    564    }
    565
    566    /* Write entry into in-memory BAT */
    567    index = offset / s->block_size;
    568    assert(s->pagetable[index] == 0xFFFFFFFF);
    569    s->pagetable[index] = s->free_data_block_offset / 512;
    570
    571    /* Initialize the block's bitmap */
    572    memset(bitmap, 0xff, s->bitmap_size);
    573    ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
    574        s->bitmap_size);
    575    if (ret < 0) {
    576        return ret;
    577    }
    578
    579    /* Write new footer (the old one will be overwritten) */
    580    s->free_data_block_offset += s->block_size + s->bitmap_size;
    581    ret = rewrite_footer(bs);
    582    if (ret < 0)
    583        goto fail;
    584
    585    /* Write BAT entry to disk */
    586    bat_offset = s->bat_offset + (4 * index);
    587    bat_value = cpu_to_be32(s->pagetable[index]);
    588    ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
    589    if (ret < 0)
    590        goto fail;
    591
    592    return get_image_offset(bs, offset, false, NULL);
    593
    594fail:
    595    s->free_data_block_offset -= (s->block_size + s->bitmap_size);
    596    return ret;
    597}
    598
    599static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    600{
    601    BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
    602
    603    if (be32_to_cpu(s->footer.type) != VHD_FIXED) {
    604        bdi->cluster_size = s->block_size;
    605    }
    606
    607    return 0;
    608}
    609
    610static int coroutine_fn
    611vpc_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
    612              QEMUIOVector *qiov, BdrvRequestFlags flags)
    613{
    614    BDRVVPCState *s = bs->opaque;
    615    int ret;
    616    int64_t image_offset;
    617    int64_t n_bytes;
    618    int64_t bytes_done = 0;
    619    QEMUIOVector local_qiov;
    620
    621    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
    622        return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
    623    }
    624
    625    qemu_co_mutex_lock(&s->lock);
    626    qemu_iovec_init(&local_qiov, qiov->niov);
    627
    628    while (bytes > 0) {
    629        image_offset = get_image_offset(bs, offset, false, NULL);
    630        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
    631
    632        if (image_offset == -1) {
    633            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
    634        } else {
    635            qemu_iovec_reset(&local_qiov);
    636            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
    637
    638            qemu_co_mutex_unlock(&s->lock);
    639            ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
    640                                 &local_qiov, 0);
    641            qemu_co_mutex_lock(&s->lock);
    642            if (ret < 0) {
    643                goto fail;
    644            }
    645        }
    646
    647        bytes -= n_bytes;
    648        offset += n_bytes;
    649        bytes_done += n_bytes;
    650    }
    651
    652    ret = 0;
    653fail:
    654    qemu_iovec_destroy(&local_qiov);
    655    qemu_co_mutex_unlock(&s->lock);
    656
    657    return ret;
    658}
    659
    660static int coroutine_fn
    661vpc_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
    662               QEMUIOVector *qiov, BdrvRequestFlags flags)
    663{
    664    BDRVVPCState *s = bs->opaque;
    665    int64_t image_offset;
    666    int64_t n_bytes;
    667    int64_t bytes_done = 0;
    668    int ret = 0;
    669    QEMUIOVector local_qiov;
    670
    671    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
    672        return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
    673    }
    674
    675    qemu_co_mutex_lock(&s->lock);
    676    qemu_iovec_init(&local_qiov, qiov->niov);
    677
    678    while (bytes > 0) {
    679        image_offset = get_image_offset(bs, offset, true, &ret);
    680        if (image_offset == -2) {
    681            /* Failed to write block bitmap: can't proceed with write */
    682            goto fail;
    683        }
    684        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
    685
    686        if (image_offset == -1) {
    687            image_offset = alloc_block(bs, offset);
    688            if (image_offset < 0) {
    689                ret = image_offset;
    690                goto fail;
    691            }
    692        }
    693
    694        qemu_iovec_reset(&local_qiov);
    695        qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
    696
    697        qemu_co_mutex_unlock(&s->lock);
    698        ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
    699                              &local_qiov, 0);
    700        qemu_co_mutex_lock(&s->lock);
    701        if (ret < 0) {
    702            goto fail;
    703        }
    704
    705        bytes -= n_bytes;
    706        offset += n_bytes;
    707        bytes_done += n_bytes;
    708    }
    709
    710    ret = 0;
    711fail:
    712    qemu_iovec_destroy(&local_qiov);
    713    qemu_co_mutex_unlock(&s->lock);
    714
    715    return ret;
    716}
    717
/*
 * Reports the allocation status of the range starting at 'offset'.
 *
 * For fixed images the whole range is reported as data mapped 1:1 onto
 * the underlying file.  Otherwise the status of the block containing
 * 'offset' decides the result: an allocated block yields a data extent
 * that ends at the block boundary (or at the end of the request), while
 * consecutive unallocated blocks are merged into a single zero extent.
 *
 * *pnum receives the length of the extent; *map and *file are only set
 * when data is reported.  want_zero is not used.
 *
 * Returns the BDRV_BLOCK_* flags describing the extent.
 */
static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
                                            bool want_zero,
                                            int64_t offset, int64_t bytes,
                                            int64_t *pnum, int64_t *map,
                                            BlockDriverState **file)
{
    BDRVVPCState *s = bs->opaque;
    int64_t image_offset;
    bool allocated;
    int ret;
    int64_t n;

    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
        *pnum = bytes;
        *map = offset;
        *file = bs->file->bs;
        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_RECURSE;
    }

    qemu_co_mutex_lock(&s->lock);

    image_offset = get_image_offset(bs, offset, false, NULL);
    /* 'allocated' describes the first block only; it is not updated below */
    allocated = (image_offset != -1);
    *pnum = 0;
    ret = BDRV_BLOCK_ZERO;

    do {
        /* All sectors in a block are contiguous (without using the bitmap) */
        /* Distance to the next block boundary, capped by the request */
        n = ROUND_UP(offset + 1, s->block_size) - offset;
        n = MIN(n, bytes);

        *pnum += n;
        offset += n;
        bytes -= n;
        /* *pnum can't be greater than one block for allocated
         * sectors since there is always a bitmap in between. */
        if (allocated) {
            *file = bs->file->bs;
            *map = image_offset;
            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
            break;
        }
        if (bytes == 0) {
            break;
        }
        /* Keep extending the zero extent while the following blocks are
           unallocated as well */
        image_offset = get_image_offset(bs, offset, false, NULL);
    } while (image_offset == -1);

    qemu_co_mutex_unlock(&s->lock);
    return ret;
}
    769
    770/*
    771 * Calculates the number of cylinders, heads and sectors per cylinder
    772 * based on a given number of sectors. This is the algorithm described
    773 * in the VHD specification.
    774 *
    775 * Note that the geometry doesn't always exactly match total_sectors but
    776 * may round it down.
    777 *
    778 * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
    779 * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
    780 * and instead allow up to 255 heads.
    781 */
    782static int calculate_geometry(int64_t total_sectors, uint16_t *cyls,
    783    uint8_t *heads, uint8_t *secs_per_cyl)
    784{
    785    uint32_t cyls_times_heads;
    786
    787    total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
    788
    789    if (total_sectors >= 65535LL * 16 * 63) {
    790        *secs_per_cyl = 255;
    791        *heads = 16;
    792        cyls_times_heads = total_sectors / *secs_per_cyl;
    793    } else {
    794        *secs_per_cyl = 17;
    795        cyls_times_heads = total_sectors / *secs_per_cyl;
    796        *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
    797
    798        if (*heads < 4) {
    799            *heads = 4;
    800        }
    801
    802        if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
    803            *secs_per_cyl = 31;
    804            *heads = 16;
    805            cyls_times_heads = total_sectors / *secs_per_cyl;
    806        }
    807
    808        if (cyls_times_heads >= (*heads * 1024)) {
    809            *secs_per_cyl = 63;
    810            *heads = 16;
    811            cyls_times_heads = total_sectors / *secs_per_cyl;
    812        }
    813    }
    814
    815    *cyls = cyls_times_heads / *heads;
    816
    817    return 0;
    818}
    819
    820static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
    821                               int64_t total_sectors)
    822{
    823    VHDDynDiskHeader dyndisk_header;
    824    uint8_t bat_sector[512];
    825    size_t block_size, num_bat_entries;
    826    int i;
    827    int ret;
    828    int64_t offset = 0;
    829
    830    /* Write the footer (twice: at the beginning and at the end) */
    831    block_size = 0x200000;
    832    num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512);
    833
    834    ret = blk_pwrite(blk, offset, footer, sizeof(*footer), 0);
    835    if (ret < 0) {
    836        goto fail;
    837    }
    838
    839    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
    840    ret = blk_pwrite(blk, offset, footer, sizeof(*footer), 0);
    841    if (ret < 0) {
    842        goto fail;
    843    }
    844
    845    /* Write the initial BAT */
    846    offset = 3 * 512;
    847
    848    memset(bat_sector, 0xFF, 512);
    849    for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
    850        ret = blk_pwrite(blk, offset, bat_sector, 512, 0);
    851        if (ret < 0) {
    852            goto fail;
    853        }
    854        offset += 512;
    855    }
    856
    857    /* Prepare the Dynamic Disk Header */
    858    memset(&dyndisk_header, 0, sizeof(dyndisk_header));
    859
    860    memcpy(dyndisk_header.magic, "cxsparse", 8);
    861
    862    /*
    863     * Note: The spec is actually wrong here for data_offset, it says
    864     * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
    865     */
    866    dyndisk_header.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
    867    dyndisk_header.table_offset = cpu_to_be64(3 * 512);
    868    dyndisk_header.version = cpu_to_be32(0x00010000);
    869    dyndisk_header.block_size = cpu_to_be32(block_size);
    870    dyndisk_header.max_table_entries = cpu_to_be32(num_bat_entries);
    871
    872    dyndisk_header.checksum = cpu_to_be32(
    873        vpc_checksum(&dyndisk_header, sizeof(dyndisk_header)));
    874
    875    /* Write the header */
    876    offset = 512;
    877
    878    ret = blk_pwrite(blk, offset, &dyndisk_header, sizeof(dyndisk_header), 0);
    879    if (ret < 0) {
    880        goto fail;
    881    }
    882
    883    ret = 0;
    884 fail:
    885    return ret;
    886}
    887
    888static int create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
    889                             int64_t total_size, Error **errp)
    890{
    891    int ret;
    892
    893    /* Add footer to total size */
    894    total_size += sizeof(*footer);
    895
    896    ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
    897    if (ret < 0) {
    898        return ret;
    899    }
    900
    901    ret = blk_pwrite(blk, total_size - sizeof(*footer),
    902                     footer, sizeof(*footer), 0);
    903    if (ret < 0) {
    904        error_setg_errno(errp, -ret, "Unable to write VHD header");
    905        return ret;
    906    }
    907
    908    return 0;
    909}
    910
    911static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
    912                                        uint16_t *out_cyls,
    913                                        uint8_t *out_heads,
    914                                        uint8_t *out_secs_per_cyl,
    915                                        int64_t *out_total_sectors,
    916                                        Error **errp)
    917{
    918    int64_t total_size = vpc_opts->size;
    919    uint16_t cyls = 0;
    920    uint8_t heads = 0;
    921    uint8_t secs_per_cyl = 0;
    922    int64_t total_sectors;
    923    int i;
    924
    925    /*
    926     * Calculate matching total_size and geometry. Increase the number of
    927     * sectors requested until we get enough (or fail). This ensures that
    928     * qemu-img convert doesn't truncate images, but rather rounds up.
    929     *
    930     * If the image size can't be represented by a spec conformant CHS geometry,
    931     * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
    932     * the image size from the VHD footer to calculate total_sectors.
    933     */
    934    if (vpc_opts->force_size) {
    935        /* This will force the use of total_size for sector count, below */
    936        cyls         = VHD_CHS_MAX_C;
    937        heads        = VHD_CHS_MAX_H;
    938        secs_per_cyl = VHD_CHS_MAX_S;
    939    } else {
    940        total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
    941        for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
    942            calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
    943        }
    944    }
    945
    946    if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
    947        total_sectors = total_size / BDRV_SECTOR_SIZE;
    948        /* Allow a maximum disk size of 2040 GiB */
    949        if (total_sectors > VHD_MAX_SECTORS) {
    950            error_setg(errp, "Disk size is too large, max size is 2040 GiB");
    951            return -EFBIG;
    952        }
    953    } else {
    954        total_sectors = (int64_t) cyls * heads * secs_per_cyl;
    955    }
    956
    957    *out_total_sectors = total_sectors;
    958    if (out_cyls) {
    959        *out_cyls = cyls;
    960        *out_heads = heads;
    961        *out_secs_per_cyl = secs_per_cyl;
    962    }
    963
    964    return 0;
    965}
    966
    967static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
    968                                      Error **errp)
    969{
    970    BlockdevCreateOptionsVpc *vpc_opts;
    971    BlockBackend *blk = NULL;
    972    BlockDriverState *bs = NULL;
    973
    974    VHDFooter footer;
    975    uint16_t cyls = 0;
    976    uint8_t heads = 0;
    977    uint8_t secs_per_cyl = 0;
    978    int64_t total_sectors;
    979    int64_t total_size;
    980    int disk_type;
    981    int ret = -EIO;
    982    QemuUUID uuid;
    983
    984    assert(opts->driver == BLOCKDEV_DRIVER_VPC);
    985    vpc_opts = &opts->u.vpc;
    986
    987    /* Validate options and set default values */
    988    total_size = vpc_opts->size;
    989
    990    if (!vpc_opts->has_subformat) {
    991        vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
    992    }
    993    switch (vpc_opts->subformat) {
    994    case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
    995        disk_type = VHD_DYNAMIC;
    996        break;
    997    case BLOCKDEV_VPC_SUBFORMAT_FIXED:
    998        disk_type = VHD_FIXED;
    999        break;
   1000    default:
   1001        g_assert_not_reached();
   1002    }
   1003
   1004    /* Create BlockBackend to write to the image */
   1005    bs = bdrv_open_blockdev_ref(vpc_opts->file, errp);
   1006    if (bs == NULL) {
   1007        return -EIO;
   1008    }
   1009
   1010    blk = blk_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
   1011                          errp);
   1012    if (!blk) {
   1013        ret = -EPERM;
   1014        goto out;
   1015    }
   1016    blk_set_allow_write_beyond_eof(blk, true);
   1017
   1018    /* Get geometry and check that it matches the image size*/
   1019    ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
   1020                                       &total_sectors, errp);
   1021    if (ret < 0) {
   1022        goto out;
   1023    }
   1024
   1025    if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
   1026        error_setg(errp, "The requested image size cannot be represented in "
   1027                         "CHS geometry");
   1028        error_append_hint(errp, "Try size=%llu or force-size=on (the "
   1029                                "latter makes the image incompatible with "
   1030                                "Virtual PC)",
   1031                          total_sectors * BDRV_SECTOR_SIZE);
   1032        ret = -EINVAL;
   1033        goto out;
   1034    }
   1035
   1036    /* Prepare the Hard Disk Footer */
   1037    memset(&footer, 0, sizeof(footer));
   1038
   1039    memcpy(footer.creator, "conectix", 8);
   1040    if (vpc_opts->force_size) {
   1041        memcpy(footer.creator_app, "qem2", 4);
   1042    } else {
   1043        memcpy(footer.creator_app, "qemu", 4);
   1044    }
   1045    memcpy(footer.creator_os, "Wi2k", 4);
   1046
   1047    footer.features = cpu_to_be32(0x02);
   1048    footer.version = cpu_to_be32(0x00010000);
   1049    if (disk_type == VHD_DYNAMIC) {
   1050        footer.data_offset = cpu_to_be64(sizeof(footer));
   1051    } else {
   1052        footer.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
   1053    }
   1054    footer.timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
   1055
   1056    /* Version of Virtual PC 2007 */
   1057    footer.major = cpu_to_be16(0x0005);
   1058    footer.minor = cpu_to_be16(0x0003);
   1059    footer.orig_size = cpu_to_be64(total_size);
   1060    footer.current_size = cpu_to_be64(total_size);
   1061    footer.cyls = cpu_to_be16(cyls);
   1062    footer.heads = heads;
   1063    footer.secs_per_cyl = secs_per_cyl;
   1064
   1065    footer.type = cpu_to_be32(disk_type);
   1066
   1067    qemu_uuid_generate(&uuid);
   1068    footer.uuid = uuid;
   1069
   1070    footer.checksum = cpu_to_be32(vpc_checksum(&footer, sizeof(footer)));
   1071
   1072    if (disk_type == VHD_DYNAMIC) {
   1073        ret = create_dynamic_disk(blk, &footer, total_sectors);
   1074        if (ret < 0) {
   1075            error_setg(errp, "Unable to create or write VHD header");
   1076        }
   1077    } else {
   1078        ret = create_fixed_disk(blk, &footer, total_size, errp);
   1079    }
   1080
   1081out:
   1082    blk_unref(blk);
   1083    bdrv_unref(bs);
   1084    return ret;
   1085}
   1086
   1087static int coroutine_fn vpc_co_create_opts(BlockDriver *drv,
   1088                                           const char *filename,
   1089                                           QemuOpts *opts,
   1090                                           Error **errp)
   1091{
   1092    BlockdevCreateOptions *create_options = NULL;
   1093    QDict *qdict;
   1094    Visitor *v;
   1095    BlockDriverState *bs = NULL;
   1096    int ret;
   1097
   1098    static const QDictRenames opt_renames[] = {
   1099        { VPC_OPT_FORCE_SIZE,           "force-size" },
   1100        { NULL, NULL },
   1101    };
   1102
   1103    /* Parse options and convert legacy syntax */
   1104    qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
   1105
   1106    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
   1107        ret = -EINVAL;
   1108        goto fail;
   1109    }
   1110
   1111    /* Create and open the file (protocol layer) */
   1112    ret = bdrv_create_file(filename, opts, errp);
   1113    if (ret < 0) {
   1114        goto fail;
   1115    }
   1116
   1117    bs = bdrv_open(filename, NULL, NULL,
   1118                   BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
   1119    if (bs == NULL) {
   1120        ret = -EIO;
   1121        goto fail;
   1122    }
   1123
   1124    /* Now get the QAPI type BlockdevCreateOptions */
   1125    qdict_put_str(qdict, "driver", "vpc");
   1126    qdict_put_str(qdict, "file", bs->node_name);
   1127
   1128    v = qobject_input_visitor_new_flat_confused(qdict, errp);
   1129    if (!v) {
   1130        ret = -EINVAL;
   1131        goto fail;
   1132    }
   1133
   1134    visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
   1135    visit_free(v);
   1136    if (!create_options) {
   1137        ret = -EINVAL;
   1138        goto fail;
   1139    }
   1140
   1141    /* Silently round up size */
   1142    assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
   1143    create_options->u.vpc.size =
   1144        ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
   1145
   1146    if (!create_options->u.vpc.force_size) {
   1147        int64_t total_sectors;
   1148        ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
   1149                                           NULL, &total_sectors, errp);
   1150        if (ret < 0) {
   1151            goto fail;
   1152        }
   1153
   1154        create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
   1155    }
   1156
   1157
   1158    /* Create the vpc image (format layer) */
   1159    ret = vpc_co_create(create_options, errp);
   1160
   1161fail:
   1162    qobject_unref(qdict);
   1163    bdrv_unref(bs);
   1164    qapi_free_BlockdevCreateOptions(create_options);
   1165    return ret;
   1166}
   1167
   1168
   1169static int vpc_has_zero_init(BlockDriverState *bs)
   1170{
   1171    BDRVVPCState *s = bs->opaque;
   1172
   1173    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
   1174        return bdrv_has_zero_init(bs->file->bs);
   1175    } else {
   1176        return 1;
   1177    }
   1178}
   1179
   1180static void vpc_close(BlockDriverState *bs)
   1181{
   1182    BDRVVPCState *s = bs->opaque;
   1183    qemu_vfree(s->pagetable);
   1184#ifdef CACHE
   1185    g_free(s->pageentry_u8);
   1186#endif
   1187
   1188    migrate_del_blocker(s->migration_blocker);
   1189    error_free(s->migration_blocker);
   1190}
   1191
   1192static QemuOptsList vpc_create_opts = {
   1193    .name = "vpc-create-opts",
   1194    .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
   1195    .desc = {
   1196        {
   1197            .name = BLOCK_OPT_SIZE,
   1198            .type = QEMU_OPT_SIZE,
   1199            .help = "Virtual disk size"
   1200        },
   1201        {
   1202            .name = BLOCK_OPT_SUBFMT,
   1203            .type = QEMU_OPT_STRING,
   1204            .help =
   1205                "Type of virtual hard disk format. Supported formats are "
   1206                "{dynamic (default) | fixed} "
   1207        },
   1208        {
   1209            .name = VPC_OPT_FORCE_SIZE,
   1210            .type = QEMU_OPT_BOOL,
   1211            .help = "Force disk size calculation to use the actual size "
   1212                    "specified, rather than using the nearest CHS-based "
   1213                    "calculation"
   1214        },
   1215        { /* end of list */ }
   1216    }
   1217};
   1218
   1219static const char *const vpc_strong_runtime_opts[] = {
   1220    VPC_OPT_SIZE_CALC,
   1221
   1222    NULL
   1223};
   1224
   1225static BlockDriver bdrv_vpc = {
   1226    .format_name    = "vpc",
   1227    .instance_size  = sizeof(BDRVVPCState),
   1228
   1229    .bdrv_probe             = vpc_probe,
   1230    .bdrv_open              = vpc_open,
   1231    .bdrv_close             = vpc_close,
   1232    .bdrv_reopen_prepare    = vpc_reopen_prepare,
   1233    .bdrv_child_perm        = bdrv_default_perms,
   1234    .bdrv_co_create         = vpc_co_create,
   1235    .bdrv_co_create_opts    = vpc_co_create_opts,
   1236
   1237    .bdrv_co_preadv             = vpc_co_preadv,
   1238    .bdrv_co_pwritev            = vpc_co_pwritev,
   1239    .bdrv_co_block_status       = vpc_co_block_status,
   1240
   1241    .bdrv_get_info          = vpc_get_info,
   1242
   1243    .is_format              = true,
   1244    .create_opts            = &vpc_create_opts,
   1245    .bdrv_has_zero_init     = vpc_has_zero_init,
   1246    .strong_runtime_opts    = vpc_strong_runtime_opts,
   1247};
   1248
   1249static void bdrv_vpc_init(void)
   1250{
   1251    bdrv_register(&bdrv_vpc);
   1252}
   1253
   1254block_init(bdrv_vpc_init);