cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

block.c (29655B)


      1/*
      2 * QEMU live block migration
      3 *
      4 * Copyright IBM, Corp. 2009
      5 *
      6 * Authors:
      7 *  Liran Schour   <lirans@il.ibm.com>
      8 *
      9 * This work is licensed under the terms of the GNU GPL, version 2.  See
     10 * the COPYING file in the top-level directory.
     11 *
     12 * Contributions after 2012-01-13 are licensed under the terms of the
     13 * GNU GPL, version 2 or (at your option) any later version.
     14 */
     15
     16#include "qemu/osdep.h"
     17#include "qapi/error.h"
     18#include "qemu/error-report.h"
     19#include "qemu/main-loop.h"
     20#include "qemu/cutils.h"
     21#include "qemu/queue.h"
     22#include "block.h"
     23#include "migration/misc.h"
     24#include "migration.h"
     25#include "migration/register.h"
     26#include "qemu-file.h"
     27#include "migration/vmstate.h"
     28#include "sysemu/block-backend.h"
     29#include "trace.h"
     30
     31#define BLK_MIG_BLOCK_SIZE           (1 << 20)
     32#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS)
     33
     34#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
     35#define BLK_MIG_FLAG_EOS                0x02
     36#define BLK_MIG_FLAG_PROGRESS           0x04
     37#define BLK_MIG_FLAG_ZERO_BLOCK         0x08
     38
     39#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)
     40
     41#define MAX_IO_BUFFERS 512
     42#define MAX_PARALLEL_IO 16
     43
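/*
 * A worked example of the chunk geometry above (a sketch, assuming the usual
 * 512-byte sectors, i.e. BDRV_SECTOR_BITS == 9):
 *
 *   BLK_MIG_BLOCK_SIZE           = 1 << 20        = 1 MiB per chunk
 *   BDRV_SECTORS_PER_DIRTY_CHUNK = (1 << 20) >> 9 = 2048 sectors per chunk
 *
 * A 10 GiB disk is therefore tracked as 10240 chunks of 1 MiB each.
 */
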
     44/* #define DEBUG_BLK_MIGRATION */
     45
     46#ifdef DEBUG_BLK_MIGRATION
     47#define DPRINTF(fmt, ...) \
     48    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
     49#else
     50#define DPRINTF(fmt, ...) \
     51    do { } while (0)
     52#endif
     53
     54typedef struct BlkMigDevState {
     55    /* Written during setup phase.  Can be read without a lock.  */
     56    BlockBackend *blk;
     57    char *blk_name;
     58    int shared_base;
     59    int64_t total_sectors;
     60    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
     61    Error *blocker;
     62
     63    /* Only used by migration thread.  Does not need a lock.  */
     64    int bulk_completed;
     65    int64_t cur_sector;
     66    int64_t cur_dirty;
     67
     68    /* Data in the aio_bitmap is protected by block migration lock.
     69     * Allocation and free happen during setup and cleanup respectively.
     70     */
     71    unsigned long *aio_bitmap;
     72
     73    /* Protected by block migration lock.  */
     74    int64_t completed_sectors;
     75
     76    /* During migration this is protected by iothread lock / AioContext.
     77     * Allocation and free happen during setup and cleanup respectively.
     78     */
     79    BdrvDirtyBitmap *dirty_bitmap;
     80} BlkMigDevState;
     81
     82typedef struct BlkMigBlock {
     83    /* Only used by migration thread.  */
     84    uint8_t *buf;
     85    BlkMigDevState *bmds;
     86    int64_t sector;
     87    int nr_sectors;
     88    QEMUIOVector qiov;
     89    BlockAIOCB *aiocb;
     90
     91    /* Protected by block migration lock.  */
     92    int ret;
     93    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
     94} BlkMigBlock;
     95
     96typedef struct BlkMigState {
     97    QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list;
     98    int64_t total_sector_sum;
     99    bool zero_blocks;
    100
    101    /* Protected by lock.  */
    102    QSIMPLEQ_HEAD(, BlkMigBlock) blk_list;
    103    int submitted;
    104    int read_done;
    105
    106    /* Only used by migration thread.  Does not need a lock.  */
    107    int transferred;
    108    int prev_progress;
    109    int bulk_completed;
    110
    111    /* Lock must be taken _inside_ the iothread lock and any AioContexts.  */
    112    QemuMutex lock;
    113} BlkMigState;
    114
    115static BlkMigState block_mig_state;
    116
    117static void blk_mig_lock(void)
    118{
    119    qemu_mutex_lock(&block_mig_state.lock);
    120}
    121
    122static void blk_mig_unlock(void)
    123{
    124    qemu_mutex_unlock(&block_mig_state.lock);
    125}
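
/*
 * Illustrative lock ordering only (a sketch, not code taken from this file):
 * per the comment on BlkMigState.lock, the block migration lock nests inside
 * the iothread lock and any AioContext, so a caller needing all three would
 * acquire and release them in this order:
 *
 *   qemu_mutex_lock_iothread();
 *   aio_context_acquire(blk_get_aio_context(bmds->blk));
 *   blk_mig_lock();
 *   ...
 *   blk_mig_unlock();
 *   aio_context_release(blk_get_aio_context(bmds->blk));
 *   qemu_mutex_unlock_iothread();
 */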
    126
    127/* Must run outside of the iothread lock during the bulk phase,
    128 * or the VM will stall.
    129 */
    130
    131static void blk_send(QEMUFile *f, BlkMigBlock * blk)
    132{
    133    int len;
    134    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;
    135
    136    if (block_mig_state.zero_blocks &&
    137        buffer_is_zero(blk->buf, BLK_MIG_BLOCK_SIZE)) {
    138        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    139    }
    140
    141    /* sector number and flags */
    142    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
    143                     | flags);
    144
    145    /* device name */
    146    len = strlen(blk->bmds->blk_name);
    147    qemu_put_byte(f, len);
    148    qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len);
    149
     150    /* If a block is zero we need to flush here, since the network
     151     * bandwidth is now much higher than the storage device bandwidth;
     152     * queueing zero blocks would therefore slow down the migration. */
    153    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
    154        qemu_fflush(f);
    155        return;
    156    }
    157
    158    qemu_put_buffer(f, blk->buf, BLK_MIG_BLOCK_SIZE);
    159}
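
/*
 * Informal sketch of the stream layout emitted by blk_send() above (and
 * parsed again in block_load() below); the field widths simply follow the
 * qemu_put_* calls and are not a normative format description:
 *
 *   8 bytes BE   (sector << BDRV_SECTOR_BITS) | flags
 *   1 byte       length of the device name
 *   n bytes      device name (not NUL-terminated)
 *   1 MiB        block payload, omitted when BLK_MIG_FLAG_ZERO_BLOCK is set
 */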
    160
    161int blk_mig_active(void)
    162{
    163    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
    164}
    165
    166int blk_mig_bulk_active(void)
    167{
    168    return blk_mig_active() && !block_mig_state.bulk_completed;
    169}
    170
    171uint64_t blk_mig_bytes_transferred(void)
    172{
    173    BlkMigDevState *bmds;
    174    uint64_t sum = 0;
    175
    176    blk_mig_lock();
    177    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    178        sum += bmds->completed_sectors;
    179    }
    180    blk_mig_unlock();
    181    return sum << BDRV_SECTOR_BITS;
    182}
    183
    184uint64_t blk_mig_bytes_remaining(void)
    185{
    186    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
    187}
    188
    189uint64_t blk_mig_bytes_total(void)
    190{
    191    BlkMigDevState *bmds;
    192    uint64_t sum = 0;
    193
    194    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    195        sum += bmds->total_sectors;
    196    }
    197    return sum << BDRV_SECTOR_BITS;
    198}
    199
    200
    201/* Called with migration lock held.  */
    202
    203static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
    204{
    205    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
    206
    207    if (sector < blk_nb_sectors(bmds->blk)) {
    208        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
    209            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    210    } else {
    211        return 0;
    212    }
    213}
    214
    215/* Called with migration lock held.  */
    216
    217static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
    218                             int nb_sectors, int set)
    219{
    220    int64_t start, end;
    221    unsigned long val, idx, bit;
    222
    223    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    224    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
    225
    226    for (; start <= end; start++) {
    227        idx = start / (sizeof(unsigned long) * 8);
    228        bit = start % (sizeof(unsigned long) * 8);
    229        val = bmds->aio_bitmap[idx];
    230        if (set) {
    231            val |= 1UL << bit;
    232        } else {
    233            val &= ~(1UL << bit);
    234        }
    235        bmds->aio_bitmap[idx] = val;
    236    }
    237}
    238
    239static void alloc_aio_bitmap(BlkMigDevState *bmds)
    240{
    241    BlockBackend *bb = bmds->blk;
    242    int64_t bitmap_size;
    243
    244    bitmap_size = blk_nb_sectors(bb) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    245    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
    246
    247    bmds->aio_bitmap = g_malloc0(bitmap_size);
    248}
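
/*
 * Sizing sketch for the aio_bitmap allocated above: one bit per dirty chunk,
 * rounded up to whole bytes.  Assuming 512-byte sectors, a 10 GiB disk has
 * 20971520 sectors, i.e. 10240 chunks, so
 *
 *   bitmap_size = (20971520 + 2048 * 8 - 1) / (2048 * 8) = 1280 bytes
 *
 * which provides exactly 10240 bits.
 */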
    249
    250/* Never hold migration lock when yielding to the main loop!  */
    251
    252static void blk_mig_read_cb(void *opaque, int ret)
    253{
    254    BlkMigBlock *blk = opaque;
    255
    256    blk_mig_lock();
    257    blk->ret = ret;
    258
    259    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    260    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);
    261
    262    block_mig_state.submitted--;
    263    block_mig_state.read_done++;
    264    assert(block_mig_state.submitted >= 0);
    265    blk_mig_unlock();
    266}
    267
    268/* Called with no lock taken.  */
    269
    270static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
    271{
    272    int64_t total_sectors = bmds->total_sectors;
    273    int64_t cur_sector = bmds->cur_sector;
    274    BlockBackend *bb = bmds->blk;
    275    BlkMigBlock *blk;
    276    int nr_sectors;
    277    int64_t count;
    278
    279    if (bmds->shared_base) {
    280        qemu_mutex_lock_iothread();
    281        aio_context_acquire(blk_get_aio_context(bb));
    282        /* Skip unallocated sectors; intentionally treats failure or
    283         * partial sector as an allocated sector */
    284        while (cur_sector < total_sectors &&
    285               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
    286                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
    287            if (count < BDRV_SECTOR_SIZE) {
    288                break;
    289            }
    290            cur_sector += count >> BDRV_SECTOR_BITS;
    291        }
    292        aio_context_release(blk_get_aio_context(bb));
    293        qemu_mutex_unlock_iothread();
    294    }
    295
    296    if (cur_sector >= total_sectors) {
    297        bmds->cur_sector = bmds->completed_sectors = total_sectors;
    298        return 1;
    299    }
    300
    301    bmds->completed_sectors = cur_sector;
    302
    303    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
    304
    305    /* we are going to transfer a full block even if it is not allocated */
    306    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
    307
    308    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
    309        nr_sectors = total_sectors - cur_sector;
    310    }
    311
    312    blk = g_new(BlkMigBlock, 1);
    313    blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    314    blk->bmds = bmds;
    315    blk->sector = cur_sector;
    316    blk->nr_sectors = nr_sectors;
    317
    318    qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE);
    319
    320    blk_mig_lock();
    321    block_mig_state.submitted++;
    322    blk_mig_unlock();
    323
    324    /* We do not know if bs is under the main thread (and thus does
    325     * not acquire the AioContext when doing AIO) or rather under
    326     * dataplane.  Thus acquire both the iothread mutex and the
    327     * AioContext.
    328     *
    329     * This is ugly and will disappear when we make bdrv_* thread-safe,
    330     * without the need to acquire the AioContext.
    331     */
    332    qemu_mutex_lock_iothread();
    333    aio_context_acquire(blk_get_aio_context(bmds->blk));
    334    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE,
    335                            nr_sectors * BDRV_SECTOR_SIZE);
    336    blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov,
    337                                0, blk_mig_read_cb, blk);
    338    aio_context_release(blk_get_aio_context(bmds->blk));
    339    qemu_mutex_unlock_iothread();
    340
    341    bmds->cur_sector = cur_sector + nr_sectors;
    342    return (bmds->cur_sector >= total_sectors);
    343}
    344
    345/* Called with iothread lock taken.  */
    346
    347static int set_dirty_tracking(void)
    348{
    349    BlkMigDevState *bmds;
    350    int ret;
    351
    352    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    353        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk),
    354                                                      BLK_MIG_BLOCK_SIZE,
    355                                                      NULL, NULL);
    356        if (!bmds->dirty_bitmap) {
    357            ret = -errno;
    358            goto fail;
    359        }
    360    }
    361    return 0;
    362
    363fail:
    364    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    365        if (bmds->dirty_bitmap) {
    366            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
    367        }
    368    }
    369    return ret;
    370}
    371
    372/* Called with iothread lock taken.  */
    373
    374static void unset_dirty_tracking(void)
    375{
    376    BlkMigDevState *bmds;
    377
    378    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    379        bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
    380    }
    381}
    382
    383static int init_blk_migration(QEMUFile *f)
    384{
    385    BlockDriverState *bs;
    386    BlkMigDevState *bmds;
    387    int64_t sectors;
    388    BdrvNextIterator it;
    389    int i, num_bs = 0;
    390    struct {
    391        BlkMigDevState *bmds;
    392        BlockDriverState *bs;
    393    } *bmds_bs;
    394    Error *local_err = NULL;
    395    int ret;
    396
    397    block_mig_state.submitted = 0;
    398    block_mig_state.read_done = 0;
    399    block_mig_state.transferred = 0;
    400    block_mig_state.total_sector_sum = 0;
    401    block_mig_state.prev_progress = -1;
    402    block_mig_state.bulk_completed = 0;
    403    block_mig_state.zero_blocks = migrate_zero_blocks();
    404
    405    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
    406        num_bs++;
    407    }
    408    bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs));
    409
    410    for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) {
    411        if (bdrv_is_read_only(bs)) {
    412            continue;
    413        }
    414
    415        sectors = bdrv_nb_sectors(bs);
    416        if (sectors <= 0) {
    417            ret = sectors;
    418            bdrv_next_cleanup(&it);
    419            goto out;
    420        }
    421
    422        bmds = g_new0(BlkMigDevState, 1);
    423        bmds->blk = blk_new(qemu_get_aio_context(),
    424                            BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
    425        bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
    426        bmds->bulk_completed = 0;
    427        bmds->total_sectors = sectors;
    428        bmds->completed_sectors = 0;
    429        bmds->shared_base = migrate_use_block_incremental();
    430
    431        assert(i < num_bs);
    432        bmds_bs[i].bmds = bmds;
    433        bmds_bs[i].bs = bs;
    434
    435        block_mig_state.total_sector_sum += sectors;
    436
    437        if (bmds->shared_base) {
    438            trace_migration_block_init_shared(bdrv_get_device_name(bs));
    439        } else {
    440            trace_migration_block_init_full(bdrv_get_device_name(bs));
    441        }
    442
    443        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    444    }
    445
     446    /* We can only insert the new BDSes now, because doing so while iterating
     447     * block devices could end up in a deadlock (visiting the new BDSes, too). */
    448    for (i = 0; i < num_bs; i++) {
    449        BlkMigDevState *bmds = bmds_bs[i].bmds;
    450        BlockDriverState *bs = bmds_bs[i].bs;
    451
    452        if (bmds) {
    453            ret = blk_insert_bs(bmds->blk, bs, &local_err);
    454            if (ret < 0) {
    455                error_report_err(local_err);
    456                goto out;
    457            }
    458
    459            alloc_aio_bitmap(bmds);
    460            error_setg(&bmds->blocker, "block device is in use by migration");
    461            bdrv_op_block_all(bs, bmds->blocker);
    462        }
    463    }
    464
    465    ret = 0;
    466out:
    467    g_free(bmds_bs);
    468    return ret;
    469}
    470
    471/* Called with no lock taken.  */
    472
    473static int blk_mig_save_bulked_block(QEMUFile *f)
    474{
    475    int64_t completed_sector_sum = 0;
    476    BlkMigDevState *bmds;
    477    int progress;
    478    int ret = 0;
    479
    480    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    481        if (bmds->bulk_completed == 0) {
    482            if (mig_save_device_bulk(f, bmds) == 1) {
    483                /* completed bulk section for this device */
    484                bmds->bulk_completed = 1;
    485            }
    486            completed_sector_sum += bmds->completed_sectors;
    487            ret = 1;
    488            break;
    489        } else {
    490            completed_sector_sum += bmds->completed_sectors;
    491        }
    492    }
    493
    494    if (block_mig_state.total_sector_sum != 0) {
    495        progress = completed_sector_sum * 100 /
    496                   block_mig_state.total_sector_sum;
    497    } else {
    498        progress = 100;
    499    }
    500    if (progress != block_mig_state.prev_progress) {
    501        block_mig_state.prev_progress = progress;
    502        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
    503                         | BLK_MIG_FLAG_PROGRESS);
    504        DPRINTF("Completed %d %%\r", progress);
    505    }
    506
    507    return ret;
    508}
    509
    510static void blk_mig_reset_dirty_cursor(void)
    511{
    512    BlkMigDevState *bmds;
    513
    514    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    515        bmds->cur_dirty = 0;
    516    }
    517}
    518
    519/* Called with iothread lock and AioContext taken.  */
    520
    521static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
    522                                 int is_async)
    523{
    524    BlkMigBlock *blk;
    525    int64_t total_sectors = bmds->total_sectors;
    526    int64_t sector;
    527    int nr_sectors;
    528    int ret = -EIO;
    529
    530    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
    531        blk_mig_lock();
    532        if (bmds_aio_inflight(bmds, sector)) {
    533            blk_mig_unlock();
    534            blk_drain(bmds->blk);
    535        } else {
    536            blk_mig_unlock();
    537        }
    538        bdrv_dirty_bitmap_lock(bmds->dirty_bitmap);
    539        if (bdrv_dirty_bitmap_get_locked(bmds->dirty_bitmap,
    540                                         sector * BDRV_SECTOR_SIZE)) {
    541            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
    542                nr_sectors = total_sectors - sector;
    543            } else {
    544                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
    545            }
    546            bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap,
    547                                           sector * BDRV_SECTOR_SIZE,
    548                                           nr_sectors * BDRV_SECTOR_SIZE);
    549            bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
    550
    551            blk = g_new(BlkMigBlock, 1);
    552            blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    553            blk->bmds = bmds;
    554            blk->sector = sector;
    555            blk->nr_sectors = nr_sectors;
    556
    557            if (is_async) {
    558                qemu_iovec_init_buf(&blk->qiov, blk->buf,
    559                                    nr_sectors * BDRV_SECTOR_SIZE);
    560
    561                blk->aiocb = blk_aio_preadv(bmds->blk,
    562                                            sector * BDRV_SECTOR_SIZE,
    563                                            &blk->qiov, 0, blk_mig_read_cb,
    564                                            blk);
    565
    566                blk_mig_lock();
    567                block_mig_state.submitted++;
    568                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
    569                blk_mig_unlock();
    570            } else {
    571                ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE, blk->buf,
    572                                nr_sectors * BDRV_SECTOR_SIZE);
    573                if (ret < 0) {
    574                    goto error;
    575                }
    576                blk_send(f, blk);
    577
    578                g_free(blk->buf);
    579                g_free(blk);
    580            }
    581
    582            sector += nr_sectors;
    583            bmds->cur_dirty = sector;
    584            break;
    585        }
    586
    587        bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
    588        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
    589        bmds->cur_dirty = sector;
    590    }
    591
    592    return (bmds->cur_dirty >= bmds->total_sectors);
    593
    594error:
    595    trace_migration_block_save_device_dirty(sector);
    596    g_free(blk->buf);
    597    g_free(blk);
    598    return ret;
    599}
    600
    601/* Called with iothread lock taken.
    602 *
    603 * return value:
    604 * 0: too much data for max_downtime
     605 * 1: little enough data for max_downtime
     606 */
    607static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
    608{
    609    BlkMigDevState *bmds;
    610    int ret = 1;
    611
    612    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    613        aio_context_acquire(blk_get_aio_context(bmds->blk));
    614        ret = mig_save_device_dirty(f, bmds, is_async);
    615        aio_context_release(blk_get_aio_context(bmds->blk));
    616        if (ret <= 0) {
    617            break;
    618        }
    619    }
    620
    621    return ret;
    622}
    623
    624/* Called with no locks taken.  */
    625
    626static int flush_blks(QEMUFile *f)
    627{
    628    BlkMigBlock *blk;
    629    int ret = 0;
    630
    631    trace_migration_block_flush_blks("Enter", block_mig_state.submitted,
    632                                     block_mig_state.read_done,
    633                                     block_mig_state.transferred);
    634
    635    blk_mig_lock();
    636    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
    637        if (qemu_file_rate_limit(f)) {
    638            break;
    639        }
    640        if (blk->ret < 0) {
    641            ret = blk->ret;
    642            break;
    643        }
    644
    645        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
    646        blk_mig_unlock();
    647        blk_send(f, blk);
    648        blk_mig_lock();
    649
    650        g_free(blk->buf);
    651        g_free(blk);
    652
    653        block_mig_state.read_done--;
    654        block_mig_state.transferred++;
    655        assert(block_mig_state.read_done >= 0);
    656    }
    657    blk_mig_unlock();
    658
    659    trace_migration_block_flush_blks("Exit", block_mig_state.submitted,
    660                                     block_mig_state.read_done,
    661                                     block_mig_state.transferred);
    662    return ret;
    663}
    664
    665/* Called with iothread lock taken.  */
    666
    667static int64_t get_remaining_dirty(void)
    668{
    669    BlkMigDevState *bmds;
    670    int64_t dirty = 0;
    671
    672    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    673        aio_context_acquire(blk_get_aio_context(bmds->blk));
    674        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
    675        aio_context_release(blk_get_aio_context(bmds->blk));
    676    }
    677
    678    return dirty;
    679}
    680
    681
    682
    683/* Called with iothread lock taken.  */
    684static void block_migration_cleanup_bmds(void)
    685{
    686    BlkMigDevState *bmds;
    687    AioContext *ctx;
    688
    689    unset_dirty_tracking();
    690
    691    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
    692        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
    693        bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
    694        error_free(bmds->blocker);
    695
    696        /* Save ctx, because bmds->blk can disappear during blk_unref.  */
    697        ctx = blk_get_aio_context(bmds->blk);
    698        aio_context_acquire(ctx);
    699        blk_unref(bmds->blk);
    700        aio_context_release(ctx);
    701
    702        g_free(bmds->blk_name);
    703        g_free(bmds->aio_bitmap);
    704        g_free(bmds);
    705    }
    706}
    707
    708/* Called with iothread lock taken.  */
    709static void block_migration_cleanup(void *opaque)
    710{
    711    BlkMigBlock *blk;
    712
    713    bdrv_drain_all();
    714
    715    block_migration_cleanup_bmds();
    716
    717    blk_mig_lock();
    718    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
    719        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
    720        g_free(blk->buf);
    721        g_free(blk);
    722    }
    723    blk_mig_unlock();
    724}
    725
    726static int block_save_setup(QEMUFile *f, void *opaque)
    727{
    728    int ret;
    729
    730    trace_migration_block_save("setup", block_mig_state.submitted,
    731                               block_mig_state.transferred);
    732
    733    qemu_mutex_lock_iothread();
    734    ret = init_blk_migration(f);
    735    if (ret < 0) {
    736        qemu_mutex_unlock_iothread();
    737        return ret;
    738    }
    739
     740    /* start tracking dirty blocks */
    741    ret = set_dirty_tracking();
    742
    743    qemu_mutex_unlock_iothread();
    744
    745    if (ret) {
    746        return ret;
    747    }
    748
    749    ret = flush_blks(f);
    750    blk_mig_reset_dirty_cursor();
    751    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    752
    753    return ret;
    754}
    755
    756static int block_save_iterate(QEMUFile *f, void *opaque)
    757{
    758    int ret;
    759    int64_t last_ftell = qemu_ftell(f);
    760    int64_t delta_ftell;
    761
    762    trace_migration_block_save("iterate", block_mig_state.submitted,
    763                               block_mig_state.transferred);
    764
    765    ret = flush_blks(f);
    766    if (ret) {
    767        return ret;
    768    }
    769
    770    blk_mig_reset_dirty_cursor();
    771
    772    /* control the rate of transfer */
    773    blk_mig_lock();
    774    while (block_mig_state.read_done * BLK_MIG_BLOCK_SIZE <
    775           qemu_file_get_rate_limit(f) &&
    776           block_mig_state.submitted < MAX_PARALLEL_IO &&
    777           (block_mig_state.submitted + block_mig_state.read_done) <
    778           MAX_IO_BUFFERS) {
    779        blk_mig_unlock();
    780        if (block_mig_state.bulk_completed == 0) {
    781            /* first finish the bulk phase */
    782            if (blk_mig_save_bulked_block(f) == 0) {
    783                /* finished saving bulk on all devices */
    784                block_mig_state.bulk_completed = 1;
    785            }
    786            ret = 0;
    787        } else {
    788            /* Always called with iothread lock taken for
    789             * simplicity, block_save_complete also calls it.
    790             */
    791            qemu_mutex_lock_iothread();
    792            ret = blk_mig_save_dirty_block(f, 1);
    793            qemu_mutex_unlock_iothread();
    794        }
    795        if (ret < 0) {
    796            return ret;
    797        }
    798        blk_mig_lock();
    799        if (ret != 0) {
    800            /* no more dirty blocks */
    801            break;
    802        }
    803    }
    804    blk_mig_unlock();
    805
    806    ret = flush_blks(f);
    807    if (ret) {
    808        return ret;
    809    }
    810
    811    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    812    delta_ftell = qemu_ftell(f) - last_ftell;
    813    if (delta_ftell > 0) {
    814        return 1;
    815    } else if (delta_ftell < 0) {
    816        return -1;
    817    } else {
    818        return 0;
    819    }
    820}
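
/*
 * Informal reading of the rate control loop above (no additional policy is
 * implied): new reads are only submitted while
 *   - read_done * BLK_MIG_BLOCK_SIZE stays below the migration rate limit,
 *   - fewer than MAX_PARALLEL_IO (16) reads are in flight, and
 *   - submitted + read_done stays below MAX_IO_BUFFERS (512),
 * which bounds the read-but-not-yet-sent backlog to roughly 512 MiB.
 */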
    821
    822/* Called with iothread lock taken.  */
    823
    824static int block_save_complete(QEMUFile *f, void *opaque)
    825{
    826    int ret;
    827
    828    trace_migration_block_save("complete", block_mig_state.submitted,
    829                               block_mig_state.transferred);
    830
    831    ret = flush_blks(f);
    832    if (ret) {
    833        return ret;
    834    }
    835
    836    blk_mig_reset_dirty_cursor();
    837
     838    /* We know for sure that the bulk save has completed and
     839       all async reads have completed. */
    840    blk_mig_lock();
    841    assert(block_mig_state.submitted == 0);
    842    blk_mig_unlock();
    843
    844    do {
    845        ret = blk_mig_save_dirty_block(f, 0);
    846        if (ret < 0) {
    847            return ret;
    848        }
    849    } while (ret == 0);
    850
    851    /* report completion */
    852    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);
    853
    854    trace_migration_block_save_complete();
    855
    856    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    857
    858    /* Make sure that our BlockBackends are gone, so that the block driver
    859     * nodes can be inactivated. */
    860    block_migration_cleanup_bmds();
    861
    862    return 0;
    863}
    864
    865static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
    866                               uint64_t *res_precopy_only,
    867                               uint64_t *res_compatible,
    868                               uint64_t *res_postcopy_only)
    869{
    870    /* Estimate pending number of bytes to send */
    871    uint64_t pending;
    872
    873    qemu_mutex_lock_iothread();
    874    pending = get_remaining_dirty();
    875    qemu_mutex_unlock_iothread();
    876
    877    blk_mig_lock();
    878    pending += block_mig_state.submitted * BLK_MIG_BLOCK_SIZE +
    879               block_mig_state.read_done * BLK_MIG_BLOCK_SIZE;
    880    blk_mig_unlock();
    881
    882    /* Report at least one block pending during bulk phase */
    883    if (pending <= max_size && !block_mig_state.bulk_completed) {
    884        pending = max_size + BLK_MIG_BLOCK_SIZE;
    885    }
    886
    887    trace_migration_block_save_pending(pending);
    888    /* We don't do postcopy */
    889    *res_precopy_only += pending;
    890}
    891
    892static int block_load(QEMUFile *f, void *opaque, int version_id)
    893{
    894    static int banner_printed;
    895    int len, flags;
    896    char device_name[256];
    897    int64_t addr;
    898    BlockBackend *blk, *blk_prev = NULL;
    899    Error *local_err = NULL;
    900    uint8_t *buf;
    901    int64_t total_sectors = 0;
    902    int nr_sectors;
    903    int ret;
    904    BlockDriverInfo bdi;
    905    int cluster_size = BLK_MIG_BLOCK_SIZE;
    906
    907    do {
    908        addr = qemu_get_be64(f);
    909
    910        flags = addr & (BDRV_SECTOR_SIZE - 1);
    911        addr >>= BDRV_SECTOR_BITS;
    912
    913        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
    914            /* get device name */
    915            len = qemu_get_byte(f);
    916            qemu_get_buffer(f, (uint8_t *)device_name, len);
    917            device_name[len] = '\0';
    918
    919            blk = blk_by_name(device_name);
    920            if (!blk) {
    921                fprintf(stderr, "Error unknown block device %s\n",
    922                        device_name);
    923                return -EINVAL;
    924            }
    925
    926            if (blk != blk_prev) {
    927                blk_prev = blk;
    928                total_sectors = blk_nb_sectors(blk);
    929                if (total_sectors <= 0) {
    930                    error_report("Error getting length of block device %s",
    931                                 device_name);
    932                    return -EINVAL;
    933                }
    934
    935                blk_invalidate_cache(blk, &local_err);
    936                if (local_err) {
    937                    error_report_err(local_err);
    938                    return -EINVAL;
    939                }
    940
    941                ret = bdrv_get_info(blk_bs(blk), &bdi);
    942                if (ret == 0 && bdi.cluster_size > 0 &&
    943                    bdi.cluster_size <= BLK_MIG_BLOCK_SIZE &&
    944                    BLK_MIG_BLOCK_SIZE % bdi.cluster_size == 0) {
    945                    cluster_size = bdi.cluster_size;
    946                } else {
    947                    cluster_size = BLK_MIG_BLOCK_SIZE;
    948                }
    949            }
    950
    951            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
    952                nr_sectors = total_sectors - addr;
    953            } else {
    954                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
    955            }
    956
    957            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
    958                ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE,
    959                                        nr_sectors * BDRV_SECTOR_SIZE,
    960                                        BDRV_REQ_MAY_UNMAP);
    961            } else {
    962                int i;
    963                int64_t cur_addr;
    964                uint8_t *cur_buf;
    965
    966                buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    967                qemu_get_buffer(f, buf, BLK_MIG_BLOCK_SIZE);
    968                for (i = 0; i < BLK_MIG_BLOCK_SIZE / cluster_size; i++) {
    969                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
    970                    cur_buf = buf + i * cluster_size;
    971
    972                    if ((!block_mig_state.zero_blocks ||
    973                        cluster_size < BLK_MIG_BLOCK_SIZE) &&
    974                        buffer_is_zero(cur_buf, cluster_size)) {
    975                        ret = blk_pwrite_zeroes(blk, cur_addr,
    976                                                cluster_size,
    977                                                BDRV_REQ_MAY_UNMAP);
    978                    } else {
    979                        ret = blk_pwrite(blk, cur_addr, cur_buf,
    980                                         cluster_size, 0);
    981                    }
    982                    if (ret < 0) {
    983                        break;
    984                    }
    985                }
    986                g_free(buf);
    987            }
    988
    989            if (ret < 0) {
    990                return ret;
    991            }
    992        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
    993            if (!banner_printed) {
    994                printf("Receiving block device images\n");
    995                banner_printed = 1;
    996            }
    997            printf("Completed %d %%%c", (int)addr,
    998                   (addr == 100) ? '\n' : '\r');
    999            fflush(stdout);
   1000        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
   1001            fprintf(stderr, "Unknown block migration flags: 0x%x\n", flags);
   1002            return -EINVAL;
   1003        }
   1004        ret = qemu_file_get_error(f);
   1005        if (ret != 0) {
   1006            return ret;
   1007        }
   1008    } while (!(flags & BLK_MIG_FLAG_EOS));
   1009
   1010    return 0;
   1011}
   1012
   1013static bool block_is_active(void *opaque)
   1014{
   1015    return migrate_use_block();
   1016}
   1017
   1018static SaveVMHandlers savevm_block_handlers = {
   1019    .save_setup = block_save_setup,
   1020    .save_live_iterate = block_save_iterate,
   1021    .save_live_complete_precopy = block_save_complete,
   1022    .save_live_pending = block_save_pending,
   1023    .load_state = block_load,
   1024    .save_cleanup = block_migration_cleanup,
   1025    .is_active = block_is_active,
   1026};
   1027
   1028void blk_mig_init(void)
   1029{
   1030    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
   1031    QSIMPLEQ_INIT(&block_mig_state.blk_list);
   1032    qemu_mutex_init(&block_mig_state.lock);
   1033
   1034    register_savevm_live("block", 0, 1, &savevm_block_handlers,
   1035                         &block_mig_state);
   1036}
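
/*
 * Usage note (an assumption inferred from block_is_active() checking
 * migrate_use_block(), not something defined in this file): these handlers
 * only run when block migration has been requested, e.g. by enabling the
 * "block" migration capability or passing -b to the HMP migrate command;
 * the generic migration thread then drives block_save_setup() and the
 * iterate/complete callbacks registered above.
 */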