cachepc-qemu

Fork of AMDESE/qemu with changes for the cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

migration.c (136470B)


/*
 * QEMU live migration
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "migration/blocker.h"
#include "exec.h"
#include "fd.h"
#include "socket.h"
#include "sysemu/runstate.h"
#include "sysemu/sysemu.h"
#include "sysemu/cpu-throttle.h"
#include "rdma.h"
#include "ram.h"
#include "migration/global_state.h"
#include "migration/misc.h"
#include "migration.h"
#include "savevm.h"
#include "qemu-file-channel.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "block/block.h"
#include "qapi/error.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-visit-migration.h"
#include "qapi/qapi-visit-sockets.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "qapi/qmp/qnull.h"
#include "qemu/rcu.h"
#include "block.h"
#include "postcopy-ram.h"
#include "qemu/thread.h"
#include "trace.h"
#include "exec/target_page.h"
#include "io/channel-buffer.h"
#include "migration/colo.h"
#include "hw/boards.h"
#include "hw/qdev-properties.h"
#include "hw/qdev-properties-system.h"
#include "monitor/monitor.h"
#include "net/announce.h"
#include "qemu/queue.h"
#include "multifd.h"
#include "qemu/yank.h"
#include "sysemu/cpus.h"
#include "yank_functions.h"

#define MAX_THROTTLE  (128 << 20)      /* Migration transfer speed throttling */

/* Amount of time to allocate to each "chunk" of bandwidth-throttled
 * data. */
#define BUFFER_DELAY     100
#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
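
/*
 * For example: with BUFFER_DELAY at 100 ms, XFER_LIMIT_RATIO is 10
 * chunks per second, so each chunk is budgeted roughly
 * max_bandwidth / XFER_LIMIT_RATIO bytes before the sender waits for
 * the next window.
 */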

/* Time in milliseconds we are allowed to stop the source,
 * for sending the last part */
#define DEFAULT_MIGRATE_SET_DOWNTIME 300

/* Maximum migrate downtime set to 2000 seconds */
#define MAX_MIGRATE_DOWNTIME_SECONDS 2000
#define MAX_MIGRATE_DOWNTIME (MAX_MIGRATE_DOWNTIME_SECONDS * 1000)

/* Default compression thread count */
#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
/* Default decompression thread count; decompression is usually at
 * least four times as fast as compression. */
#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
/* 0: no compression, 1: best speed, ... 9: best compression ratio */
#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
/* Define default autoconverge cpu throttle migration parameters */
#define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50
#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10
#define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99

/* Migration XBZRLE default cache size */
#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024)

/* The delay time (in ms) between two COLO checkpoints */
#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
#define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE
/* 0: no compression, 1: best speed, ... 9: best compression ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
/* 0: no compression, 1: best speed, ... 20: best compression ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1

/* Background transfer rate for postcopy; 0 means unlimited. Note
 * that page requests can still exceed this limit.
 */
#define DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH 0

/*
 * Parameters for self_announce_delay giving a stream of RARP/ARP
 * packets after migration.
 */
#define DEFAULT_MIGRATE_ANNOUNCE_INITIAL  50
#define DEFAULT_MIGRATE_ANNOUNCE_MAX     550
#define DEFAULT_MIGRATE_ANNOUNCE_ROUNDS    5
#define DEFAULT_MIGRATE_ANNOUNCE_STEP    100
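
/*
 * Illustrative timeline under these defaults (the authoritative logic
 * lives in net/announce.c): ANNOUNCE_ROUNDS packets are sent, with the
 * delay starting at ANNOUNCE_INITIAL ms and growing by ANNOUNCE_STEP ms
 * per round, capped at ANNOUNCE_MAX ms.
 */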

static NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);

/* Messages sent on the return path from destination to source */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
    MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32 ) */

    MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */

    MIG_RP_MSG_MAX
};
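
/*
 * On the wire (see migrate_send_rp_message()) each return-path message
 * is framed as: be16 message type, be16 payload length, then the
 * payload bytes themselves.
 */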

/* Migration capabilities set */
struct MigrateCapsSet {
    int size;                       /* Capability set size */
    MigrationCapability caps[];     /* Flexible array of capabilities */
};
typedef struct MigrateCapsSet MigrateCapsSet;

/* Define and initialize MigrateCapsSet */
#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...)   \
    MigrateCapsSet _name = {    \
        .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \
        .caps = { __VA_ARGS__ } \
    }
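
/*
 * The .size initializer counts the variadic arguments with a compound
 * literal: sizeof((int []) { __VA_ARGS__ }) / sizeof(int). As an
 * illustrative (hypothetical) expansion,
 *   INITIALIZE_MIGRATE_CAPS_SET(two_caps, CAP_A, CAP_B);
 * becomes roughly
 *   MigrateCapsSet two_caps = { .size = 2, .caps = { CAP_A, CAP_B } };
 */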

/* Background-snapshot compatibility check list */
static const
INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot,
    MIGRATION_CAPABILITY_POSTCOPY_RAM,
    MIGRATION_CAPABILITY_DIRTY_BITMAPS,
    MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME,
    MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE,
    MIGRATION_CAPABILITY_RETURN_PATH,
    MIGRATION_CAPABILITY_MULTIFD,
    MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER,
    MIGRATION_CAPABILITY_AUTO_CONVERGE,
    MIGRATION_CAPABILITY_RELEASE_RAM,
    MIGRATION_CAPABILITY_RDMA_PIN_ALL,
    MIGRATION_CAPABILITY_COMPRESS,
    MIGRATION_CAPABILITY_XBZRLE,
    MIGRATION_CAPABILITY_X_COLO,
    MIGRATION_CAPABILITY_VALIDATE_UUID);

/* When we add fault tolerance, we could have several
   migrations at once.  For now we don't need dynamic
   creation of migrations. */

static MigrationState *current_migration;
static MigrationIncomingState *current_incoming;

static GSList *migration_blockers;

static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s,
                                 int *current_active_state,
                                 int new_state);
static void migrate_fd_cancel(MigrationState *s);

static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
{
    uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;

    return (a > b) - (a < b);
}
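
/*
 * The (a > b) - (a < b) idiom above yields -1, 0 or 1 without the
 * overflow risk of returning a - b; GTree only needs the sign, e.g.
 * a = 0x1000, b = 0x2000 gives 0 - 1 = -1.
 */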

void migration_object_init(void)
{
    /* This can only be called once. */
    assert(!current_migration);
    current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));

    /*
     * Initialize the incoming migration object as well, whether or
     * not we'll use it.
     */
    assert(!current_incoming);
    current_incoming = g_new0(MigrationIncomingState, 1);
    current_incoming->state = MIGRATION_STATUS_NONE;
    current_incoming->postcopy_remote_fds =
        g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
    qemu_mutex_init(&current_incoming->rp_mutex);
    qemu_event_init(&current_incoming->main_thread_load_event, false);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
    qemu_mutex_init(&current_incoming->page_request_mutex);
    current_incoming->page_requested = g_tree_new(page_request_addr_cmp);

    migration_object_check(current_migration, &error_fatal);

    blk_mig_init();
    ram_mig_init();
    dirty_bitmap_mig_init();
}

void migration_cancel(void)
{
    migrate_fd_cancel(current_migration);
}

void migration_shutdown(void)
{
    /*
     * Cancel the current migration - that will (eventually)
     * stop the migration using this structure
     */
    migration_cancel();
    object_unref(OBJECT(current_migration));

    /*
     * Cancel outgoing migration of dirty bitmaps. It should
     * at least unref used block nodes.
     */
    dirty_bitmap_mig_cancel_outgoing();

    /*
     * Cancel incoming migration of dirty bitmaps. Dirty bitmaps
     * are non-critical data, and their loss is never considered
     * serious.
     */
    dirty_bitmap_mig_cancel_incoming();
}

/* For outgoing */
MigrationState *migrate_get_current(void)
{
    /* This can only be called after the object has been created. */
    assert(current_migration);
    return current_migration;
}

MigrationIncomingState *migration_incoming_get_current(void)
{
    assert(current_incoming);
    return current_incoming;
}

void migration_incoming_state_destroy(void)
{
    struct MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->to_src_file) {
        /* Tell source that we are done */
        migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
        qemu_fclose(mis->to_src_file);
        mis->to_src_file = NULL;
    }

    if (mis->from_src_file) {
        migration_ioc_unregister_yank_from_file(mis->from_src_file);
        qemu_fclose(mis->from_src_file);
        mis->from_src_file = NULL;
    }
    if (mis->postcopy_remote_fds) {
        g_array_free(mis->postcopy_remote_fds, TRUE);
        mis->postcopy_remote_fds = NULL;
    }
    if (mis->transport_cleanup) {
        mis->transport_cleanup(mis->transport_data);
    }

    qemu_event_reset(&mis->main_thread_load_event);

    if (mis->page_requested) {
        g_tree_destroy(mis->page_requested);
        mis->page_requested = NULL;
    }

    if (mis->socket_address_list) {
        qapi_free_SocketAddressList(mis->socket_address_list);
        mis->socket_address_list = NULL;
    }

    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}

static void migrate_generate_event(int new_state)
{
    if (migrate_use_events()) {
        qapi_event_send_migration(new_state);
    }
}

static bool migrate_late_block_activate(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[
        MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE];
}

/*
 * Send a message on the return channel back to the source
 * of the migration.
 */
static int migrate_send_rp_message(MigrationIncomingState *mis,
                                   enum mig_rp_message_type message_type,
                                   uint16_t len, void *data)
{
    int ret = 0;

    trace_migrate_send_rp_message((int)message_type, len);
    QEMU_LOCK_GUARD(&mis->rp_mutex);

    /*
     * It's possible that the file handle got lost due to network
     * failures.
     */
    if (!mis->to_src_file) {
        ret = -EIO;
        return ret;
    }

    qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
    qemu_put_be16(mis->to_src_file, len);
    qemu_put_buffer(mis->to_src_file, data, len);
    qemu_fflush(mis->to_src_file);

    /* It's possible that the QEMU file hit an error during sending */
    ret = qemu_file_get_error(mis->to_src_file);

    return ret;
}

/* Request one page from the source VM at the given start address.
 *   rb: the RAMBlock to request the page in
 *   start: address offset within the RB
 *   len: length in bytes required - must be a multiple of pagesize
 */
int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
                                      RAMBlock *rb, ram_addr_t start)
{
    uint8_t bufc[12 + 1 + 255]; /* start (8) + len (4) + rbname len (1) + rbname (up to 255) */
    size_t msglen = 12; /* start + len */
    size_t len = qemu_ram_pagesize(rb);
    enum mig_rp_message_type msg_type;
    const char *rbname;
    int rbname_len;

    *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
    *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);

    /*
     * We track the last ramblock for which we requested a page.  Note that we
     * don't need locking because this function will only be called within the
     * postcopy ram fault thread.
     */
    if (rb != mis->last_rb) {
        mis->last_rb = rb;

        rbname = qemu_ram_get_idstr(rb);
        rbname_len = strlen(rbname);

        assert(rbname_len < 256);

        bufc[msglen++] = rbname_len;
        memcpy(bufc + msglen, rbname, rbname_len);
        msglen += rbname_len;
        msg_type = MIG_RP_MSG_REQ_PAGES_ID;
    } else {
        msg_type = MIG_RP_MSG_REQ_PAGES;
    }

    return migrate_send_rp_message(mis, msg_type, msglen, bufc);
}
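
/*
 * Resulting payload layouts (sizes in bytes), matching the enum
 * comments above:
 *   MIG_RP_MSG_REQ_PAGES:    | start: be64 | len: be32 |
 *   MIG_RP_MSG_REQ_PAGES_ID: | start: be64 | len: be32 | namelen: u8 | name |
 */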

int migrate_send_rp_req_pages(MigrationIncomingState *mis,
                              RAMBlock *rb, ram_addr_t start, uint64_t haddr)
{
    void *aligned = (void *)(uintptr_t)(haddr & (-qemu_ram_pagesize(rb)));
    bool received = false;

    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
        received = ramblock_recv_bitmap_test_byte_offset(rb, start);
        if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
            /*
             * The page has not been received, and it's not yet in the page
             * request list.  Queue it.  Set the value of element to 1, so that
             * things like g_tree_lookup() will return TRUE (1) when found.
             */
            g_tree_insert(mis->page_requested, aligned, (gpointer)1);
            mis->page_requested_count++;
            trace_postcopy_page_req_add(aligned, mis->page_requested_count);
        }
    }

    /*
     * If the page is there, skip sending the message.  We don't even need the
     * lock because as long as the page arrived, it'll be there forever.
     */
    if (received) {
        return 0;
    }

    return migrate_send_rp_message_req_pages(mis, rb, start);
}

static bool migration_colo_enabled;
bool migration_incoming_colo_enabled(void)
{
    return migration_colo_enabled;
}

void migration_incoming_disable_colo(void)
{
    ram_block_discard_disable(false);
    migration_colo_enabled = false;
}

int migration_incoming_enable_colo(void)
{
    if (ram_block_discard_disable(true)) {
        error_report("COLO: cannot disable RAM discard");
        return -EBUSY;
    }
    migration_colo_enabled = true;
    return 0;
}

void migrate_add_address(SocketAddress *address)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    QAPI_LIST_PREPEND(mis->socket_address_list,
                      QAPI_CLONE(SocketAddress, address));
}

static void qemu_start_incoming_migration(const char *uri, Error **errp)
{
    const char *p = NULL;

    qapi_event_send_migration(MIGRATION_STATUS_SETUP);
    if (strstart(uri, "tcp:", &p) ||
        strstart(uri, "unix:", NULL) ||
        strstart(uri, "vsock:", NULL)) {
        socket_start_incoming_migration(p ? p : uri, errp);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "rdma:", &p)) {
        rdma_start_incoming_migration(p, errp);
#endif
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_incoming_migration(p, errp);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_incoming_migration(p, errp);
    } else {
        error_setg(errp, "unknown migration protocol: %s", uri);
    }
}
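
/*
 * Example URIs accepted above (values are illustrative):
 *   "tcp:0:4444", "unix:/tmp/migrate.sock", "vsock:3:4444",
 *   "rdma:192.168.0.1:4444", "exec:gzip -c -d STATEFILE.gz", "fd:42"
 */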

static void process_incoming_migration_bh(void *opaque)
{
    Error *local_err = NULL;
    MigrationIncomingState *mis = opaque;

    /* If capability late_block_activate is set:
     * Only fire up the block code now if we're going to restart the
     * VM, else 'cont' will do it.
     * This causes file locking to happen, so we don't want it to
     * happen unless we really are starting the VM.
     */
    if (!migrate_late_block_activate() ||
         (autostart && (!global_state_received() ||
            global_state_get_runstate() == RUN_STATE_RUNNING))) {
        /* Make sure all file formats flush their mutable metadata.
         * If we get an error here, just don't restart the VM yet. */
        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            local_err = NULL;
            autostart = false;
        }
    }

    /*
     * This must happen after all error conditions are dealt with and
     * we're sure the VM is going to be running on this host.
     */
    qemu_announce_self(&mis->announce_timer, migrate_announce_params());

    if (multifd_load_cleanup(&local_err) != 0) {
        error_report_err(local_err);
        autostart = false;
    }
    /* If the global state section was not received or we are in running
       state, we need to obey autostart. Any other state is set with
       runstate_set. */

    dirty_bitmap_mig_before_vm_start();

    if (!global_state_received() ||
        global_state_get_runstate() == RUN_STATE_RUNNING) {
        if (autostart) {
            vm_start();
        } else {
            runstate_set(RUN_STATE_PAUSED);
        }
    } else if (migration_incoming_colo_enabled()) {
        migration_incoming_disable_colo();
        vm_start();
    } else {
        runstate_set(global_state_get_runstate());
    }
    /*
     * This must happen after any state changes since as soon as an external
     * observer sees this event they might start to prod at the VM assuming
     * it's ready to use.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_COMPLETED);
    qemu_bh_delete(mis->bh);
    migration_incoming_state_destroy();
}

static void process_incoming_migration_co(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyState ps;
    int ret;
    Error *local_err = NULL;

    assert(mis->from_src_file);
    mis->migration_incoming_co = qemu_coroutine_self();
    mis->largest_page_size = qemu_ram_pagesize_largest();
    postcopy_state_set(POSTCOPY_INCOMING_NONE);
    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
                      MIGRATION_STATUS_ACTIVE);
    ret = qemu_loadvm_state(mis->from_src_file);

    ps = postcopy_state_get();
    trace_process_incoming_migration_co_end(ret, ps);
    if (ps != POSTCOPY_INCOMING_NONE) {
        if (ps == POSTCOPY_INCOMING_ADVISE) {
            /*
             * Where a migration had postcopy enabled (and thus went to advise)
             * but managed to complete within the precopy period, we can use
             * the normal exit.
             */
            postcopy_ram_incoming_cleanup(mis);
        } else if (ret >= 0) {
            /*
             * Postcopy was started, cleanup should happen at the end of the
             * postcopy thread.
             */
            trace_process_incoming_migration_co_postcopy_end_main();
            return;
        }
        /* Else something went wrong, so just fall through to the normal exit */
    }

    /* We have the COLO info and know whether we are in COLO mode */
    if (!ret && migration_incoming_colo_enabled()) {
        /* Make sure all file formats flush their mutable metadata */
        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            goto fail;
        }

        qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
             colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
        mis->have_colo_incoming_thread = true;
        qemu_coroutine_yield();

        /* Wait for the COLO incoming thread to exit before freeing resources */
        qemu_thread_join(&mis->colo_incoming_thread);
        /* We hold the global iothread lock, so it is safe here */
        colo_release_ram_cache();
    }

    if (ret < 0) {
        error_report("load of migration failed: %s", strerror(-ret));
        goto fail;
    }
    mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
    qemu_bh_schedule(mis->bh);
    mis->migration_incoming_co = NULL;
    return;
fail:
    local_err = NULL;
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    qemu_fclose(mis->from_src_file);
    if (multifd_load_cleanup(&local_err) != 0) {
        error_report_err(local_err);
    }
    exit(EXIT_FAILURE);
}

/**
 * migration_incoming_setup: Setup incoming migration
 * @f: file for main migration channel
 * @errp: where to put errors
 *
 * Returns: %true on success, %false on error.
 */
static bool migration_incoming_setup(QEMUFile *f, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (multifd_load_setup(errp) != 0) {
        return false;
    }

    if (!mis->from_src_file) {
        mis->from_src_file = f;
    }
    qemu_file_set_blocking(f, false);
    return true;
}

void migration_incoming_process(void)
{
    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
    qemu_coroutine_enter(co);
}

/* Returns true if recovered from a paused migration, otherwise false */
static bool postcopy_try_recover(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
        /* Resumed from a paused postcopy migration */

        mis->from_src_file = f;
        /* Postcopy has a standalone thread to do the vm load */
        qemu_file_set_blocking(f, true);

        /* Re-configure the return path */
        mis->to_src_file = qemu_file_get_return_path(f);

        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                          MIGRATION_STATUS_POSTCOPY_RECOVER);

        /*
         * Here, we only wake up the main loading thread (while the
         * fault thread will still be waiting), so that we can receive
         * commands from the source now, and answer them if needed. The
         * fault thread will be woken up afterwards, once we are sure
         * that the source is ready to reply to page requests.
         */
        qemu_sem_post(&mis->postcopy_pause_sem_dst);
        return true;
    }

    return false;
}

void migration_fd_process_incoming(QEMUFile *f, Error **errp)
{
    if (postcopy_try_recover(f)) {
        return;
    }

    if (!migration_incoming_setup(f, errp)) {
        return;
    }
    migration_incoming_process();
}

void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    Error *local_err = NULL;
    bool start_migration;

    if (!mis->from_src_file) {
        /* The first connection (multifd may have multiple) */
        QEMUFile *f = qemu_fopen_channel_input(ioc);

        /* If it's a recovery, we're done */
        if (postcopy_try_recover(f)) {
            return;
        }

        if (!migration_incoming_setup(f, errp)) {
            return;
        }

        /*
         * Common migration only needs one channel, so we can start
         * right now.  Multifd needs more than one channel, so we wait.
         */
        start_migration = !migrate_use_multifd();
    } else {
        /* Multiple connections */
        assert(migrate_use_multifd());
        start_migration = multifd_recv_new_channel(ioc, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }

    if (start_migration) {
        migration_incoming_process();
    }
}

/**
 * @migration_has_all_channels: We have received all channels that we need
 *
 * Returns true when we have connections to all the channels that
 * we need for migration.
 */
bool migration_has_all_channels(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    bool all_channels;

    all_channels = multifd_recv_all_channels_created();

    return all_channels && mis->from_src_file != NULL;
}

/*
 * Send a 'SHUT' message on the return channel with the given value
 * to indicate that we've finished with the RP.  A non-zero value
 * indicates an error.
 */
void migrate_send_rp_shut(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
}

/*
 * Send a 'PONG' message on the return channel with the given value
 * (normally in response to a 'PING')
 */
void migrate_send_rp_pong(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
}
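
/*
 * A PONG simply echoes the 32-bit value carried by the source's PING;
 * the source uses the round trip to confirm the return path is alive.
 */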

void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                 char *block_name)
{
    char buf[512];
    int len;
    int64_t res;

    /*
     * First, we send the header part. It contains only the length of
     * the idstr, and the idstr itself.
     */
    len = strlen(block_name);
    buf[0] = len;
    memcpy(buf + 1, block_name, len);

    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
                     __func__);
        return;
    }

    migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);

    /*
     * Next, we dump the received bitmap to the stream.
     *
     * TODO: currently we are safe since we are the only one that is
     * using the to_src_file handle (the fault thread is still paused),
     * and it would be OK even without taking the mutex. Still, the best
     * way is to take the lock before sending the message header, and
     * release it after sending the bitmap.
     */
    qemu_mutex_lock(&mis->rp_mutex);
    res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
    qemu_mutex_unlock(&mis->rp_mutex);

    trace_migrate_send_rp_recv_bitmap(block_name, res);
}

void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
}

MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
{
    MigrationCapabilityStatusList *head = NULL, **tail = &head;
    MigrationCapabilityStatus *caps;
    MigrationState *s = migrate_get_current();
    int i;

    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
#ifndef CONFIG_LIVE_BLOCK_MIGRATION
        if (i == MIGRATION_CAPABILITY_BLOCK) {
            continue;
        }
#endif
        caps = g_malloc0(sizeof(*caps));
        caps->capability = i;
        caps->state = s->enabled_capabilities[i];
        QAPI_LIST_APPEND(tail, caps);
    }

    return head;
}

MigrationParameters *qmp_query_migrate_parameters(Error **errp)
{
    MigrationParameters *params;
    MigrationState *s = migrate_get_current();

    /* TODO use QAPI_CLONE() instead of duplicating it inline */
    params = g_malloc0(sizeof(*params));
    params->has_compress_level = true;
    params->compress_level = s->parameters.compress_level;
    params->has_compress_threads = true;
    params->compress_threads = s->parameters.compress_threads;
    params->has_compress_wait_thread = true;
    params->compress_wait_thread = s->parameters.compress_wait_thread;
    params->has_decompress_threads = true;
    params->decompress_threads = s->parameters.decompress_threads;
    params->has_throttle_trigger_threshold = true;
    params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold;
    params->has_cpu_throttle_initial = true;
    params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
    params->has_cpu_throttle_increment = true;
    params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
    params->has_cpu_throttle_tailslow = true;
    params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
    params->has_tls_creds = true;
    params->tls_creds = g_strdup(s->parameters.tls_creds);
    params->has_tls_hostname = true;
    params->tls_hostname = g_strdup(s->parameters.tls_hostname);
    params->has_tls_authz = true;
    params->tls_authz = g_strdup(s->parameters.tls_authz ?
                                 s->parameters.tls_authz : "");
    params->has_max_bandwidth = true;
    params->max_bandwidth = s->parameters.max_bandwidth;
    params->has_downtime_limit = true;
    params->downtime_limit = s->parameters.downtime_limit;
    params->has_x_checkpoint_delay = true;
    params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
    params->has_block_incremental = true;
    params->block_incremental = s->parameters.block_incremental;
    params->has_multifd_channels = true;
    params->multifd_channels = s->parameters.multifd_channels;
    params->has_multifd_compression = true;
    params->multifd_compression = s->parameters.multifd_compression;
    params->has_multifd_zlib_level = true;
    params->multifd_zlib_level = s->parameters.multifd_zlib_level;
    params->has_multifd_zstd_level = true;
    params->multifd_zstd_level = s->parameters.multifd_zstd_level;
    params->has_xbzrle_cache_size = true;
    params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;
    params->has_max_postcopy_bandwidth = true;
    params->max_postcopy_bandwidth = s->parameters.max_postcopy_bandwidth;
    params->has_max_cpu_throttle = true;
    params->max_cpu_throttle = s->parameters.max_cpu_throttle;
    params->has_announce_initial = true;
    params->announce_initial = s->parameters.announce_initial;
    params->has_announce_max = true;
    params->announce_max = s->parameters.announce_max;
    params->has_announce_rounds = true;
    params->announce_rounds = s->parameters.announce_rounds;
    params->has_announce_step = true;
    params->announce_step = s->parameters.announce_step;

    if (s->parameters.has_block_bitmap_mapping) {
        params->has_block_bitmap_mapping = true;
        params->block_bitmap_mapping =
            QAPI_CLONE(BitmapMigrationNodeAliasList,
                       s->parameters.block_bitmap_mapping);
    }

    return params;
}

AnnounceParameters *migrate_announce_params(void)
{
    static AnnounceParameters ap;

    MigrationState *s = migrate_get_current();

    ap.initial = s->parameters.announce_initial;
    ap.max = s->parameters.announce_max;
    ap.rounds = s->parameters.announce_rounds;
    ap.step = s->parameters.announce_step;

    return &ap;
}
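
/*
 * Note: a single static AnnounceParameters instance is returned and
 * refreshed from the current migration parameters on each call, so
 * callers should not cache the pointer across parameter changes.
 */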

/*
 * Return true if we're already in the middle of a migration
 * (i.e. any of the active or setup states)
 */
bool migration_is_setup_or_active(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
    case MIGRATION_STATUS_COLO:
        return true;

    default:
        return false;

    }
}

bool migration_is_running(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
    case MIGRATION_STATUS_CANCELLING:
        return true;

    default:
        return false;

    }
}

static void populate_time_info(MigrationInfo *info, MigrationState *s)
{
    info->has_status = true;
    info->has_setup_time = true;
    info->setup_time = s->setup_time;
    if (s->state == MIGRATION_STATUS_COMPLETED) {
        info->has_total_time = true;
        info->total_time = s->total_time;
        info->has_downtime = true;
        info->downtime = s->downtime;
    } else {
        info->has_total_time = true;
        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                           s->start_time;
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
    }
}

static void populate_ram_info(MigrationInfo *info, MigrationState *s)
{
    info->has_ram = true;
    info->ram = g_malloc0(sizeof(*info->ram));
    info->ram->transferred = ram_counters.transferred;
    info->ram->total = ram_bytes_total();
    info->ram->duplicate = ram_counters.duplicate;
    /* Legacy value; it is no longer used */
    info->ram->skipped = 0;
    info->ram->normal = ram_counters.normal;
    info->ram->normal_bytes = ram_counters.normal *
        qemu_target_page_size();
    info->ram->mbps = s->mbps;
    info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
    info->ram->postcopy_requests = ram_counters.postcopy_requests;
    info->ram->page_size = qemu_target_page_size();
    info->ram->multifd_bytes = ram_counters.multifd_bytes;
    info->ram->pages_per_second = s->pages_per_second;

    if (migrate_use_xbzrle()) {
        info->has_xbzrle_cache = true;
        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
        info->xbzrle_cache->bytes = xbzrle_counters.bytes;
        info->xbzrle_cache->pages = xbzrle_counters.pages;
        info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
        info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
        info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate;
        info->xbzrle_cache->overflow = xbzrle_counters.overflow;
    }

    if (migrate_use_compression()) {
        info->has_compression = true;
        info->compression = g_malloc0(sizeof(*info->compression));
        info->compression->pages = compression_counters.pages;
        info->compression->busy = compression_counters.busy;
        info->compression->busy_rate = compression_counters.busy_rate;
        info->compression->compressed_size =
                                    compression_counters.compressed_size;
        info->compression->compression_rate =
                                    compression_counters.compression_rate;
    }

    if (cpu_throttle_active()) {
        info->has_cpu_throttle_percentage = true;
        info->cpu_throttle_percentage = cpu_throttle_get_percentage();
    }

    if (s->state != MIGRATION_STATUS_COMPLETED) {
        info->ram->remaining = ram_bytes_remaining();
        info->ram->dirty_pages_rate = ram_counters.dirty_pages_rate;
    }
}

static void populate_disk_info(MigrationInfo *info)
{
    if (blk_mig_active()) {
        info->has_disk = true;
        info->disk = g_malloc0(sizeof(*info->disk));
        info->disk->transferred = blk_mig_bytes_transferred();
        info->disk->remaining = blk_mig_bytes_remaining();
        info->disk->total = blk_mig_bytes_total();
    }
}

static void fill_source_migration_info(MigrationInfo *info)
{
    MigrationState *s = migrate_get_current();
    GSList *cur_blocker = migration_blockers;

    info->blocked_reasons = NULL;

    /*
     * There are two types of reasons a migration might be blocked:
     * a) devices marked in VMState as non-migratable, and
     * b) explicit migration blockers.
     * We need to add both of them here.
     */
    qemu_savevm_non_migratable_list(&info->blocked_reasons);

    while (cur_blocker) {
        QAPI_LIST_PREPEND(info->blocked_reasons,
                          g_strdup(error_get_pretty(cur_blocker->data)));
        cur_blocker = g_slist_next(cur_blocker);
    }
    info->has_blocked_reasons = info->blocked_reasons != NULL;

    switch (s->state) {
    case MIGRATION_STATUS_NONE:
        /* No migration has ever happened */
        /* Do not overwrite the destination migration status */
        return;
    case MIGRATION_STATUS_SETUP:
        info->has_status = true;
        info->has_total_time = false;
        break;
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
        /* TODO add some postcopy stats */
        populate_time_info(info, s);
        populate_ram_info(info, s);
        populate_disk_info(info);
        populate_vfio_info(info);
        break;
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        /* TODO: display COLO specific information (checkpoint info etc.) */
        break;
    case MIGRATION_STATUS_COMPLETED:
        populate_time_info(info, s);
        populate_ram_info(info, s);
        populate_vfio_info(info);
        break;
    case MIGRATION_STATUS_FAILED:
        info->has_status = true;
        if (s->error) {
            info->has_error_desc = true;
            info->error_desc = g_strdup(error_get_pretty(s->error));
        }
        break;
    case MIGRATION_STATUS_CANCELLED:
        info->has_status = true;
        break;
    case MIGRATION_STATUS_WAIT_UNPLUG:
        info->has_status = true;
        break;
    }
    info->status = s->state;
}

typedef enum WriteTrackingSupport {
    WT_SUPPORT_UNKNOWN = 0,
    WT_SUPPORT_ABSENT,
    WT_SUPPORT_AVAILABLE,
    WT_SUPPORT_COMPATIBLE
} WriteTrackingSupport;
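
/*
 * The ordering above is significant: migrate_caps_check() compares
 * the query result with '<', so ABSENT < AVAILABLE < COMPATIBLE
 * encodes increasing levels of background-snapshot support.
 */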

static
WriteTrackingSupport migrate_query_write_tracking(void)
{
    /* Check if the kernel supports the required UFFD features */
    if (!ram_write_tracking_available()) {
        return WT_SUPPORT_ABSENT;
    }
    /*
     * Check if the current memory configuration is
     * compatible with the required UFFD features.
     */
    if (!ram_write_tracking_compatible()) {
        return WT_SUPPORT_AVAILABLE;
    }

    return WT_SUPPORT_COMPATIBLE;
}

/**
 * @migrate_caps_check - check capability validity
 *
 * @cap_list: old capability list, array of bool
 * @params: new capabilities to be applied soon
 * @errp: set *errp if the check failed, with reason
 *
 * Returns true if check passed, otherwise false.
 */
static bool migrate_caps_check(bool *cap_list,
                               MigrationCapabilityStatusList *params,
                               Error **errp)
{
    MigrationCapabilityStatusList *cap;
    bool old_postcopy_cap;
    MigrationIncomingState *mis = migration_incoming_get_current();

    old_postcopy_cap = cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM];

    for (cap = params; cap; cap = cap->next) {
        cap_list[cap->value->capability] = cap->value->state;
    }

#ifndef CONFIG_LIVE_BLOCK_MIGRATION
    if (cap_list[MIGRATION_CAPABILITY_BLOCK]) {
        error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) "
                   "block migration");
        error_append_hint(errp, "Use drive_mirror+NBD instead.\n");
        return false;
    }
#endif

#ifndef CONFIG_REPLICATION
    if (cap_list[MIGRATION_CAPABILITY_X_COLO]) {
        error_setg(errp, "QEMU compiled without replication module"
                   " can't enable COLO");
        error_append_hint(errp, "Please enable replication before COLO.\n");
        return false;
    }
#endif

    if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
        /* This check is reasonably expensive, so only run it the first
         * time the capability is being set; also, it's only the
         * destination that needs special support.
         */
        if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) &&
            !postcopy_ram_supported_by_host(mis)) {
            /* postcopy_ram_supported_by_host will have emitted a more
             * detailed message
             */
            error_setg(errp, "Postcopy is not supported");
            return false;
        }

        if (cap_list[MIGRATION_CAPABILITY_X_IGNORE_SHARED]) {
            error_setg(errp, "Postcopy is not compatible with ignore-shared");
            return false;
        }
    }

    if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
        WriteTrackingSupport wt_support;
        int idx;
        /*
         * Check if 'background-snapshot' capability is supported by
         * host kernel and compatible with guest memory configuration.
         */
        wt_support = migrate_query_write_tracking();
        if (wt_support < WT_SUPPORT_AVAILABLE) {
            error_setg(errp, "Background-snapshot is not supported by host kernel");
            return false;
        }
        if (wt_support < WT_SUPPORT_COMPATIBLE) {
            error_setg(errp, "Background-snapshot is not compatible "
                    "with guest memory configuration");
            return false;
        }

        /*
         * Check if there are any migration capabilities
         * incompatible with 'background-snapshot'.
         */
        for (idx = 0; idx < check_caps_background_snapshot.size; idx++) {
            int incomp_cap = check_caps_background_snapshot.caps[idx];
            if (cap_list[incomp_cap]) {
                error_setg(errp,
                        "Background-snapshot is not compatible with %s",
                        MigrationCapability_str(incomp_cap));
                return false;
            }
        }
    }

    return true;
}

static void fill_destination_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->socket_address_list) {
        info->has_socket_address = true;
        info->socket_address =
            QAPI_CLONE(SocketAddressList, mis->socket_address_list);
    }

    switch (mis->state) {
    case MIGRATION_STATUS_NONE:
        return;
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        break;
    case MIGRATION_STATUS_COMPLETED:
        info->has_status = true;
        fill_destination_postcopy_migration_info(info);
        break;
    }
    info->status = mis->state;
}

MigrationInfo *qmp_query_migrate(Error **errp)
{
    MigrationInfo *info = g_malloc0(sizeof(*info));

    fill_destination_migration_info(info);
    fill_source_migration_info(info);

    return info;
}

void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
                                  Error **errp)
{
    MigrationState *s = migrate_get_current();
    MigrationCapabilityStatusList *cap;
    bool cap_list[MIGRATION_CAPABILITY__MAX];

    if (migration_is_running(s->state)) {
        error_setg(errp, QERR_MIGRATION_ACTIVE);
        return;
    }

    memcpy(cap_list, s->enabled_capabilities, sizeof(cap_list));
    if (!migrate_caps_check(cap_list, params, errp)) {
        return;
    }

    for (cap = params; cap; cap = cap->next) {
        s->enabled_capabilities[cap->value->capability] = cap->value->state;
    }
}

/*
 * Check whether the parameters are valid. Error will be put into errp
 * (if provided). Return true if valid, otherwise false.
 */
static bool migrate_params_check(MigrationParameters *params, Error **errp)
{
    if (params->has_compress_level &&
        (params->compress_level > 9)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
                   "a value between 0 and 9");
        return false;
    }

    if (params->has_compress_threads && (params->compress_threads < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "compress_threads",
                   "a value between 1 and 255");
        return false;
    }

    if (params->has_decompress_threads && (params->decompress_threads < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "decompress_threads",
                   "a value between 1 and 255");
        return false;
    }

    if (params->has_throttle_trigger_threshold &&
        (params->throttle_trigger_threshold < 1 ||
         params->throttle_trigger_threshold > 100)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "throttle_trigger_threshold",
                   "an integer in the range of 1 to 100");
        return false;
    }

    if (params->has_cpu_throttle_initial &&
        (params->cpu_throttle_initial < 1 ||
         params->cpu_throttle_initial > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "cpu_throttle_initial",
                   "an integer in the range of 1 to 99");
        return false;
    }

    if (params->has_cpu_throttle_increment &&
        (params->cpu_throttle_increment < 1 ||
         params->cpu_throttle_increment > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "cpu_throttle_increment",
                   "an integer in the range of 1 to 99");
        return false;
    }

    if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "max_bandwidth",
                   "an integer in the range of 0 to "stringify(SIZE_MAX)
                   " bytes/second");
        return false;
    }

    if (params->has_downtime_limit &&
        (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "downtime_limit",
                   "an integer in the range of 0 to "
                    stringify(MAX_MIGRATE_DOWNTIME)" ms");
        return false;
    }

    /* x_checkpoint_delay is now always positive */
   1377
   1378    if (params->has_multifd_channels && (params->multifd_channels < 1)) {
   1379        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
   1380                   "multifd_channels",
   1381                   "a value between 1 and 255");
   1382        return false;
   1383    }
   1384
   1385    if (params->has_multifd_zlib_level &&
   1386        (params->multifd_zlib_level > 9)) {
   1387        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level",
   1388                   "a value between 0 and 9");
   1389        return false;
   1390    }
   1391
   1392    if (params->has_multifd_zstd_level &&
   1393        (params->multifd_zstd_level > 20)) {
   1394        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level",
   1395                   "a value between 0 and 20");
   1396        return false;
   1397    }
   1398
   1399    if (params->has_xbzrle_cache_size &&
   1400        (params->xbzrle_cache_size < qemu_target_page_size() ||
   1401         !is_power_of_2(params->xbzrle_cache_size))) {
   1402        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
   1403                   "xbzrle_cache_size",
   1404                   "a power of two no less than the target page size");
   1405        return false;
   1406    }
   1407
   1408    if (params->has_max_cpu_throttle &&
   1409        (params->max_cpu_throttle < params->cpu_throttle_initial ||
   1410         params->max_cpu_throttle > 99)) {
   1411        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
   1412                   "max_cpu_throttle",
   1413                   "an integer in the range of cpu_throttle_initial to 99");
   1414        return false;
   1415    }
   1416
   1417    if (params->has_announce_initial &&
   1418        params->announce_initial > 100000) {
   1419        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
   1420                   "announce_initial",
   1421                   "a value between 0 and 100000");
   1422        return false;
   1423    }
   1424    if (params->has_announce_max &&
   1425        params->announce_max > 100000) {
   1426        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
   1427                   "announce_max",
   1428                   "a value between 0 and 100000");
   1429        return false;
   1430    }
   1431    if (params->has_announce_rounds &&
   1432        params->announce_rounds > 1000) {
   1433        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
   1434                   "announce_rounds",
   1435                   "a value between 0 and 1000");
   1436        return false;
   1437    }
   1438    if (params->has_announce_step &&
   1439        (params->announce_step < 1 ||
   1440        params->announce_step > 10000)) {
   1441        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
   1442                   "announce_step",
   1443                   "a value between 1 and 10000");
   1444        return false;
   1445    }
   1446
   1447    if (params->has_block_bitmap_mapping &&
   1448        !check_dirty_bitmap_mig_alias_map(params->block_bitmap_mapping, errp)) {
   1449        error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: ");
   1450        return false;
   1451    }
   1452
   1453    return true;
   1454}
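/*
 * For illustration: a QMP client hits the checks above through
 * migrate-set-parameters.  A hypothetical exchange, assuming the usual
 * QERR_INVALID_PARAMETER_VALUE "Parameter '%s' expects %s" wording:
 *
 *   -> { "execute": "migrate-set-parameters",
 *        "arguments": { "cpu-throttle-increment": 0 } }
 *   <- { "error": { "class": "GenericError",
 *                   "desc": "Parameter 'cpu_throttle_increment' expects
 *                            an integer in the range of 1 to 99" } }
 *
 * Note the QMP argument uses '-' separators while the error text reports
 * the C member name with '_'.
 */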
   1455
   1456static void migrate_params_test_apply(MigrateSetParameters *params,
   1457                                      MigrationParameters *dest)
   1458{
   1459    *dest = migrate_get_current()->parameters;
   1460
   1461    /* TODO use QAPI_CLONE() instead of duplicating it inline */
   1462
   1463    if (params->has_compress_level) {
   1464        dest->compress_level = params->compress_level;
   1465    }
   1466
   1467    if (params->has_compress_threads) {
   1468        dest->compress_threads = params->compress_threads;
   1469    }
   1470
   1471    if (params->has_compress_wait_thread) {
   1472        dest->compress_wait_thread = params->compress_wait_thread;
   1473    }
   1474
   1475    if (params->has_decompress_threads) {
   1476        dest->decompress_threads = params->decompress_threads;
   1477    }
   1478
   1479    if (params->has_throttle_trigger_threshold) {
   1480        dest->throttle_trigger_threshold = params->throttle_trigger_threshold;
   1481    }
   1482
   1483    if (params->has_cpu_throttle_initial) {
   1484        dest->cpu_throttle_initial = params->cpu_throttle_initial;
   1485    }
   1486
   1487    if (params->has_cpu_throttle_increment) {
   1488        dest->cpu_throttle_increment = params->cpu_throttle_increment;
   1489    }
   1490
   1491    if (params->has_cpu_throttle_tailslow) {
   1492        dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
   1493    }
   1494
   1495    if (params->has_tls_creds) {
   1496        assert(params->tls_creds->type == QTYPE_QSTRING);
   1497        dest->tls_creds = params->tls_creds->u.s;
   1498    }
   1499
   1500    if (params->has_tls_hostname) {
   1501        assert(params->tls_hostname->type == QTYPE_QSTRING);
   1502        dest->tls_hostname = params->tls_hostname->u.s;
   1503    }
   1504
   1505    if (params->has_max_bandwidth) {
   1506        dest->max_bandwidth = params->max_bandwidth;
   1507    }
   1508
   1509    if (params->has_downtime_limit) {
   1510        dest->downtime_limit = params->downtime_limit;
   1511    }
   1512
   1513    if (params->has_x_checkpoint_delay) {
   1514        dest->x_checkpoint_delay = params->x_checkpoint_delay;
   1515    }
   1516
   1517    if (params->has_block_incremental) {
   1518        dest->block_incremental = params->block_incremental;
   1519    }
   1520    if (params->has_multifd_channels) {
   1521        dest->multifd_channels = params->multifd_channels;
   1522    }
   1523    if (params->has_multifd_compression) {
   1524        dest->multifd_compression = params->multifd_compression;
   1525    }
   1526    if (params->has_xbzrle_cache_size) {
   1527        dest->xbzrle_cache_size = params->xbzrle_cache_size;
   1528    }
   1529    if (params->has_max_postcopy_bandwidth) {
   1530        dest->max_postcopy_bandwidth = params->max_postcopy_bandwidth;
   1531    }
   1532    if (params->has_max_cpu_throttle) {
   1533        dest->max_cpu_throttle = params->max_cpu_throttle;
   1534    }
   1535    if (params->has_announce_initial) {
   1536        dest->announce_initial = params->announce_initial;
   1537    }
   1538    if (params->has_announce_max) {
   1539        dest->announce_max = params->announce_max;
   1540    }
   1541    if (params->has_announce_rounds) {
   1542        dest->announce_rounds = params->announce_rounds;
   1543    }
   1544    if (params->has_announce_step) {
   1545        dest->announce_step = params->announce_step;
   1546    }
   1547
   1548    if (params->has_block_bitmap_mapping) {
   1549        dest->has_block_bitmap_mapping = true;
   1550        dest->block_bitmap_mapping = params->block_bitmap_mapping;
   1551    }
   1552}
   1553
   1554static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
   1555{
   1556    MigrationState *s = migrate_get_current();
   1557
   1558    /* TODO use QAPI_CLONE() instead of duplicating it inline */
   1559
   1560    if (params->has_compress_level) {
   1561        s->parameters.compress_level = params->compress_level;
   1562    }
   1563
   1564    if (params->has_compress_threads) {
   1565        s->parameters.compress_threads = params->compress_threads;
   1566    }
   1567
   1568    if (params->has_compress_wait_thread) {
   1569        s->parameters.compress_wait_thread = params->compress_wait_thread;
   1570    }
   1571
   1572    if (params->has_decompress_threads) {
   1573        s->parameters.decompress_threads = params->decompress_threads;
   1574    }
   1575
   1576    if (params->has_throttle_trigger_threshold) {
   1577        s->parameters.throttle_trigger_threshold = params->throttle_trigger_threshold;
   1578    }
   1579
   1580    if (params->has_cpu_throttle_initial) {
   1581        s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
   1582    }
   1583
   1584    if (params->has_cpu_throttle_increment) {
   1585        s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
   1586    }
   1587
   1588    if (params->has_cpu_throttle_tailslow) {
   1589        s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
   1590    }
   1591
   1592    if (params->has_tls_creds) {
   1593        g_free(s->parameters.tls_creds);
   1594        assert(params->tls_creds->type == QTYPE_QSTRING);
   1595        s->parameters.tls_creds = g_strdup(params->tls_creds->u.s);
   1596    }
   1597
   1598    if (params->has_tls_hostname) {
   1599        g_free(s->parameters.tls_hostname);
   1600        assert(params->tls_hostname->type == QTYPE_QSTRING);
   1601        s->parameters.tls_hostname = g_strdup(params->tls_hostname->u.s);
   1602    }
   1603
   1604    if (params->has_tls_authz) {
   1605        g_free(s->parameters.tls_authz);
   1606        assert(params->tls_authz->type == QTYPE_QSTRING);
   1607        s->parameters.tls_authz = g_strdup(params->tls_authz->u.s);
   1608    }
   1609
   1610    if (params->has_max_bandwidth) {
   1611        s->parameters.max_bandwidth = params->max_bandwidth;
   1612        if (s->to_dst_file && !migration_in_postcopy()) {
   1613            qemu_file_set_rate_limit(s->to_dst_file,
   1614                                s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
   1615        }
   1616    }
   1617
   1618    if (params->has_downtime_limit) {
   1619        s->parameters.downtime_limit = params->downtime_limit;
   1620    }
   1621
   1622    if (params->has_x_checkpoint_delay) {
   1623        s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
   1624        if (migration_in_colo_state()) {
   1625            colo_checkpoint_notify(s);
   1626        }
   1627    }
   1628
   1629    if (params->has_block_incremental) {
   1630        s->parameters.block_incremental = params->block_incremental;
   1631    }
   1632    if (params->has_multifd_channels) {
   1633        s->parameters.multifd_channels = params->multifd_channels;
   1634    }
   1635    if (params->has_multifd_compression) {
   1636        s->parameters.multifd_compression = params->multifd_compression;
   1637    }
   1638    if (params->has_xbzrle_cache_size) {
   1639        s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
   1640        xbzrle_cache_resize(params->xbzrle_cache_size, errp);
   1641    }
   1642    if (params->has_max_postcopy_bandwidth) {
   1643        s->parameters.max_postcopy_bandwidth = params->max_postcopy_bandwidth;
   1644        if (s->to_dst_file && migration_in_postcopy()) {
   1645            qemu_file_set_rate_limit(s->to_dst_file,
   1646                    s->parameters.max_postcopy_bandwidth / XFER_LIMIT_RATIO);
   1647        }
   1648    }
   1649    if (params->has_max_cpu_throttle) {
   1650        s->parameters.max_cpu_throttle = params->max_cpu_throttle;
   1651    }
   1652    if (params->has_announce_initial) {
   1653        s->parameters.announce_initial = params->announce_initial;
   1654    }
   1655    if (params->has_announce_max) {
   1656        s->parameters.announce_max = params->announce_max;
   1657    }
   1658    if (params->has_announce_rounds) {
   1659        s->parameters.announce_rounds = params->announce_rounds;
   1660    }
   1661    if (params->has_announce_step) {
   1662        s->parameters.announce_step = params->announce_step;
   1663    }
   1664
   1665    if (params->has_block_bitmap_mapping) {
   1666        qapi_free_BitmapMigrationNodeAliasList(
   1667            s->parameters.block_bitmap_mapping);
   1668
   1669        s->parameters.has_block_bitmap_mapping = true;
   1670        s->parameters.block_bitmap_mapping =
   1671            QAPI_CLONE(BitmapMigrationNodeAliasList,
   1672                       params->block_bitmap_mapping);
   1673    }
   1674}
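/*
 * Worked example for the rate-limit conversion above (a sketch, using the
 * BUFFER_DELAY and XFER_LIMIT_RATIO definitions at the top of this file):
 * with BUFFER_DELAY == 100ms, XFER_LIMIT_RATIO == 1000 / 100 == 10, so a
 * max_bandwidth of 128 MiB/s (134217728 bytes/s) becomes a per-chunk
 * budget of 134217728 / 10 == 13421772 bytes (integer division) per
 * 100ms of transfer, which is what qemu_file_set_rate_limit() enforces.
 */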
   1675
   1676void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
   1677{
   1678    MigrationParameters tmp;
   1679
   1680    /* TODO Rewrite "" to null instead */
   1681    if (params->has_tls_creds
   1682        && params->tls_creds->type == QTYPE_QNULL) {
   1683        qobject_unref(params->tls_creds->u.n);
   1684        params->tls_creds->type = QTYPE_QSTRING;
   1685        params->tls_creds->u.s = g_strdup("");
   1686    }
   1687    /* TODO Rewrite "" to null instead */
   1688    if (params->has_tls_hostname
   1689        && params->tls_hostname->type == QTYPE_QNULL) {
   1690        qobject_unref(params->tls_hostname->u.n);
   1691        params->tls_hostname->type = QTYPE_QSTRING;
   1692        params->tls_hostname->u.s = g_strdup("");
   1693    }
   1694
   1695    migrate_params_test_apply(params, &tmp);
   1696
   1697    if (!migrate_params_check(&tmp, errp)) {
   1698        /* Invalid parameter */
   1699        return;
   1700    }
   1701
   1702    migrate_params_apply(params, errp);
   1703}
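/*
 * The overall flow above: overlay the requested values onto a scratch
 * copy of the current parameters (migrate_params_test_apply), validate
 * the merged result (migrate_params_check), and only then mutate the
 * live state (migrate_params_apply).  A rejected request therefore
 * leaves the active parameters completely untouched.
 */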
   1704
   1705
   1706void qmp_migrate_start_postcopy(Error **errp)
   1707{
   1708    MigrationState *s = migrate_get_current();
   1709
   1710    if (!migrate_postcopy()) {
   1711        error_setg(errp, "Enable postcopy with migrate_set_capability before"
   1712                         " the start of migration");
   1713        return;
   1714    }
   1715
   1716    if (s->state == MIGRATION_STATUS_NONE) {
   1717        error_setg(errp, "Postcopy must be started after migration has been"
   1718                         " started");
   1719        return;
   1720    }
   1721    /*
   1722     * We don't error if migration has already finished, since that
   1723     * would be racy with issuing this command.
   1724     */
   1725    qatomic_set(&s->start_postcopy, true);
   1726}
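/*
 * Illustrative QMP usage (postcopy-ram capability set, migration already
 * running):
 *
 *   -> { "execute": "migrate-start-postcopy" }
 *   <- { "return": {} }
 *
 * The flag is consumed asynchronously by the migration thread, hence the
 * qatomic_set().
 */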
   1727
   1728/* shared migration helpers */
   1729
   1730void migrate_set_state(int *state, int old_state, int new_state)
   1731{
   1732    assert(new_state < MIGRATION_STATUS__MAX);
   1733    if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
   1734        trace_migrate_set_state(MigrationStatus_str(new_state));
   1735        migrate_generate_event(new_state);
   1736    }
   1737}
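/*
 * The compare-and-swap above means a transition only fires (and only
 * emits one event) if the state still equals old_state.  For example:
 *
 *     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
 *                       MIGRATION_STATUS_FAILED);
 *
 * is a no-op if another thread has already moved s->state past SETUP,
 * which is why callers can race on it safely.
 */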
   1738
   1739static MigrationCapabilityStatus *migrate_cap_add(MigrationCapability index,
   1740                                                  bool state)
   1741{
   1742    MigrationCapabilityStatus *cap;
   1743
   1744    cap = g_new0(MigrationCapabilityStatus, 1);
   1745    cap->capability = index;
   1746    cap->state = state;
   1747
   1748    return cap;
   1749}
   1750
   1751void migrate_set_block_enabled(bool value, Error **errp)
   1752{
   1753    MigrationCapabilityStatusList *cap = NULL;
   1754
   1755    QAPI_LIST_PREPEND(cap, migrate_cap_add(MIGRATION_CAPABILITY_BLOCK, value));
   1756    qmp_migrate_set_capabilities(cap, errp);
   1757    qapi_free_MigrationCapabilityStatusList(cap);
   1758}
   1759
   1760static void migrate_set_block_incremental(MigrationState *s, bool value)
   1761{
   1762    s->parameters.block_incremental = value;
   1763}
   1764
   1765static void block_cleanup_parameters(MigrationState *s)
   1766{
   1767    if (s->must_remove_block_options) {
   1768        /* setting to false can never fail */
   1769        migrate_set_block_enabled(false, &error_abort);
   1770        migrate_set_block_incremental(s, false);
   1771        s->must_remove_block_options = false;
   1772    }
   1773}
   1774
   1775static void migrate_fd_cleanup(MigrationState *s)
   1776{
   1777    qemu_bh_delete(s->cleanup_bh);
   1778    s->cleanup_bh = NULL;
   1779
   1780    qemu_savevm_state_cleanup();
   1781
   1782    if (s->to_dst_file) {
   1783        QEMUFile *tmp;
   1784
   1785        trace_migrate_fd_cleanup();
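        /*
         * Drop the iothread lock around the join: the migration thread
         * may itself be blocked waiting to take this lock, so joining it
         * while holding the lock could deadlock.
         */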
   1786        qemu_mutex_unlock_iothread();
   1787        if (s->migration_thread_running) {
   1788            qemu_thread_join(&s->thread);
   1789            s->migration_thread_running = false;
   1790        }
   1791        qemu_mutex_lock_iothread();
   1792
   1793        multifd_save_cleanup();
   1794        qemu_mutex_lock(&s->qemu_file_lock);
   1795        tmp = s->to_dst_file;
   1796        s->to_dst_file = NULL;
   1797        qemu_mutex_unlock(&s->qemu_file_lock);
   1798        /*
   1799         * Close the file handle without the lock to make sure the
   1800         * critical section won't block for long.
   1801         */
   1802        migration_ioc_unregister_yank_from_file(tmp);
   1803        qemu_fclose(tmp);
   1804    }
   1805
   1806    assert(!migration_is_active(s));
   1807
   1808    if (s->state == MIGRATION_STATUS_CANCELLING) {
   1809        migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
   1810                          MIGRATION_STATUS_CANCELLED);
   1811    }
   1812
   1813    if (s->error) {
   1814        /* It is still used by 'info migrate', so we can't free it */
   1815        error_report_err(error_copy(s->error));
   1816    }
   1817    notifier_list_notify(&migration_state_notifiers, s);
   1818    block_cleanup_parameters(s);
   1819    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
   1820}
   1821
   1822static void migrate_fd_cleanup_schedule(MigrationState *s)
   1823{
   1824    /*
   1825     * Take a ref on the state for the bh, because it may run when
   1826     * there are already no other refs.
   1827     */
   1828    object_ref(OBJECT(s));
   1829    qemu_bh_schedule(s->cleanup_bh);
   1830}
   1831
   1832static void migrate_fd_cleanup_bh(void *opaque)
   1833{
   1834    MigrationState *s = opaque;
   1835    migrate_fd_cleanup(s);
   1836    object_unref(OBJECT(s));
   1837}
   1838
   1839void migrate_set_error(MigrationState *s, const Error *error)
   1840{
   1841    QEMU_LOCK_GUARD(&s->error_mutex);
   1842    if (!s->error) {
   1843        s->error = error_copy(error);
   1844    }
   1845}
   1846
   1847static void migrate_error_free(MigrationState *s)
   1848{
   1849    QEMU_LOCK_GUARD(&s->error_mutex);
   1850    if (s->error) {
   1851        error_free(s->error);
   1852        s->error = NULL;
   1853    }
   1854}
   1855
   1856void migrate_fd_error(MigrationState *s, const Error *error)
   1857{
   1858    trace_migrate_fd_error(error_get_pretty(error));
   1859    assert(s->to_dst_file == NULL);
   1860    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
   1861                      MIGRATION_STATUS_FAILED);
   1862    migrate_set_error(s, error);
   1863}
   1864
   1865static void migrate_fd_cancel(MigrationState *s)
   1866{
   1867    int old_state;
   1868    QEMUFile *f = migrate_get_current()->to_dst_file;
   1869    trace_migrate_fd_cancel();
   1870
   1871    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
   1872        if (s->rp_state.from_dst_file) {
   1873            /* Shut down the rp socket, causing the rp thread to exit */
   1874            qemu_file_shutdown(s->rp_state.from_dst_file);
   1875        }
   1876    }
   1877
   1878    do {
   1879        old_state = s->state;
   1880        if (!migration_is_running(old_state)) {
   1881            break;
   1882        }
   1883        /* If the migration is paused, kick it out of the pause */
   1884        if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
   1885            qemu_sem_post(&s->pause_sem);
   1886        }
   1887        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
   1888    } while (s->state != MIGRATION_STATUS_CANCELLING);
   1889
   1890    /*
   1891     * If we're unlucky the migration code might be stuck somewhere in a
   1892     * send/write while the network has failed and is waiting to time out;
   1893     * if we've got shutdown(2) available then we can force it to quit.
   1894     * The outgoing qemu file gets closed in migrate_fd_cleanup that is
   1895     * called in a bh, so there is no race against this cancel.
   1896     */
   1897    if (s->state == MIGRATION_STATUS_CANCELLING && f) {
   1898        qemu_file_shutdown(f);
   1899    }
   1900    if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
   1901        Error *local_err = NULL;
   1902
   1903        bdrv_invalidate_cache_all(&local_err);
   1904        if (local_err) {
   1905            error_report_err(local_err);
   1906        } else {
   1907            s->block_inactive = false;
   1908        }
   1909    }
   1910}
   1911
   1912void add_migration_state_change_notifier(Notifier *notify)
   1913{
   1914    notifier_list_add(&migration_state_notifiers, notify);
   1915}
   1916
   1917void remove_migration_state_change_notifier(Notifier *notify)
   1918{
   1919    notifier_remove(notify);
   1920}
   1921
   1922bool migration_in_setup(MigrationState *s)
   1923{
   1924    return s->state == MIGRATION_STATUS_SETUP;
   1925}
   1926
   1927bool migration_has_finished(MigrationState *s)
   1928{
   1929    return s->state == MIGRATION_STATUS_COMPLETED;
   1930}
   1931
   1932bool migration_has_failed(MigrationState *s)
   1933{
   1934    return (s->state == MIGRATION_STATUS_CANCELLED ||
   1935            s->state == MIGRATION_STATUS_FAILED);
   1936}
   1937
   1938bool migration_in_postcopy(void)
   1939{
   1940    MigrationState *s = migrate_get_current();
   1941
   1942    switch (s->state) {
   1943    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
   1944    case MIGRATION_STATUS_POSTCOPY_PAUSED:
   1945    case MIGRATION_STATUS_POSTCOPY_RECOVER:
   1946        return true;
   1947    default:
   1948        return false;
   1949    }
   1950}
   1951
   1952bool migration_in_postcopy_after_devices(MigrationState *s)
   1953{
   1954    return migration_in_postcopy() && s->postcopy_after_devices;
   1955}
   1956
   1957bool migration_in_incoming_postcopy(void)
   1958{
   1959    PostcopyState ps = postcopy_state_get();
   1960
   1961    return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
   1962}
   1963
   1964bool migration_in_bg_snapshot(void)
   1965{
   1966    MigrationState *s = migrate_get_current();
   1967
   1968    return migrate_background_snapshot() &&
   1969            migration_is_setup_or_active(s->state);
   1970}
   1971
   1972bool migration_is_idle(void)
   1973{
   1974    MigrationState *s = current_migration;
   1975
   1976    if (!s) {
   1977        return true;
   1978    }
   1979
   1980    switch (s->state) {
   1981    case MIGRATION_STATUS_NONE:
   1982    case MIGRATION_STATUS_CANCELLED:
   1983    case MIGRATION_STATUS_COMPLETED:
   1984    case MIGRATION_STATUS_FAILED:
   1985        return true;
   1986    case MIGRATION_STATUS_SETUP:
   1987    case MIGRATION_STATUS_CANCELLING:
   1988    case MIGRATION_STATUS_ACTIVE:
   1989    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
   1990    case MIGRATION_STATUS_COLO:
   1991    case MIGRATION_STATUS_PRE_SWITCHOVER:
   1992    case MIGRATION_STATUS_DEVICE:
   1993    case MIGRATION_STATUS_WAIT_UNPLUG:
   1994        return false;
   1995    case MIGRATION_STATUS__MAX:
   1996        g_assert_not_reached();
   1997    }
   1998
   1999    return false;
   2000}
   2001
   2002bool migration_is_active(MigrationState *s)
   2003{
   2004    return (s->state == MIGRATION_STATUS_ACTIVE ||
   2005            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
   2006}
   2007
   2008void migrate_init(MigrationState *s)
   2009{
   2010    /*
   2011     * Reinitialise all migration state, except
   2012     * parameters/capabilities that the user set, and
   2013     * locks.
   2014     */
   2015    s->cleanup_bh = NULL;
   2016    s->vm_start_bh = NULL;
   2017    s->to_dst_file = NULL;
   2018    s->state = MIGRATION_STATUS_NONE;
   2019    s->rp_state.from_dst_file = NULL;
   2020    s->rp_state.error = false;
   2021    s->mbps = 0.0;
   2022    s->pages_per_second = 0.0;
   2023    s->downtime = 0;
   2024    s->expected_downtime = 0;
   2025    s->setup_time = 0;
   2026    s->start_postcopy = false;
   2027    s->postcopy_after_devices = false;
   2028    s->migration_thread_running = false;
   2029    error_free(s->error);
   2030    s->error = NULL;
   2031    s->hostname = NULL;
   2032
   2033    migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
   2034
   2035    s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
   2036    s->total_time = 0;
   2037    s->vm_was_running = false;
   2038    s->iteration_initial_bytes = 0;
   2039    s->threshold_size = 0;
   2040}
   2041
   2042int migrate_add_blocker(Error *reason, Error **errp)
   2043{
   2044    if (only_migratable) {
   2045        error_propagate_prepend(errp, error_copy(reason),
   2046                                "disallowing migration blocker "
   2047                                "(--only-migratable) for: ");
   2048        return -EACCES;
   2049    }
   2050
   2051    if (migration_is_idle()) {
   2052        migration_blockers = g_slist_prepend(migration_blockers, reason);
   2053        return 0;
   2054    }
   2055
   2056    error_propagate_prepend(errp, error_copy(reason),
   2057                            "disallowing migration blocker "
   2058                            "(migration in progress) for: ");
   2059    return -EBUSY;
   2060}
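/*
 * Typical caller pattern (an illustrative sketch; 'blocker' and
 * 'local_err' are hypothetical locals, mirroring how devices use this
 * API elsewhere in the tree):
 *
 *     Error *blocker = NULL;
 *
 *     error_setg(&blocker, "Device 'foo' does not support migration");
 *     if (migrate_add_blocker(blocker, &local_err) < 0) {
 *         error_free(blocker);   (ownership was not transferred)
 *     }
 *
 * and later migrate_del_blocker(blocker) plus error_free(blocker) to
 * undo it.
 */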
   2061
   2062void migrate_del_blocker(Error *reason)
   2063{
   2064    migration_blockers = g_slist_remove(migration_blockers, reason);
   2065}
   2066
   2067void qmp_migrate_incoming(const char *uri, Error **errp)
   2068{
   2069    Error *local_err = NULL;
   2070    static bool once = true;
   2071
   2072    if (!once) {
   2073        error_setg(errp, "The incoming migration has already been started");
   2074        return;
   2075    }
   2076    if (!runstate_check(RUN_STATE_INMIGRATE)) {
   2077        error_setg(errp, "'-incoming' was not specified on the command line");
   2078        return;
   2079    }
   2080
   2081    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
   2082        return;
   2083    }
   2084
   2085    qemu_start_incoming_migration(uri, &local_err);
   2086
   2087    if (local_err) {
   2088        yank_unregister_instance(MIGRATION_YANK_INSTANCE);
   2089        error_propagate(errp, local_err);
   2090        return;
   2091    }
   2092
   2093    once = false;
   2094}
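/*
 * Illustrative usage (the address is an example): start the destination
 * with '-incoming defer', then:
 *
 *   -> { "execute": "migrate-incoming",
 *        "arguments": { "uri": "tcp:0.0.0.0:4446" } }
 *   <- { "return": {} }
 *
 * The 'once' latch above makes a second invocation fail instead of
 * re-binding the channel.
 */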
   2095
   2096void qmp_migrate_recover(const char *uri, Error **errp)
   2097{
   2098    MigrationIncomingState *mis = migration_incoming_get_current();
   2099
   2100    /*
   2101     * Don't even bother to use ERRP_GUARD() as it _must_ always be set by
   2102     * callers (no one should ignore a recover failure); if one does, it's a
   2103     * programming error.
   2104     */
   2105    assert(errp);
   2106
   2107    if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
   2108        error_setg(errp, "Migrate recover can only be run "
   2109                   "when postcopy is paused.");
   2110        return;
   2111    }
   2112
   2113    if (qatomic_cmpxchg(&mis->postcopy_recover_triggered,
   2114                       false, true) == true) {
   2115        error_setg(errp, "Migrate recovery is triggered already");
   2116        return;
   2117    }
   2118
   2119    /*
   2120     * Note that this call will never start a real migration; it will
   2121     * only re-establish the migration stream and poke the existing
   2122     * migration to continue using the newly established channel.
   2123     */
   2124    qemu_start_incoming_migration(uri, errp);
   2125
   2126    /* Safe to dereference with the assert above */
   2127    if (*errp) {
   2128        /* Reset the flag so the user can retry */
   2129        qatomic_set(&mis->postcopy_recover_triggered, false);
   2130    }
   2131}
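/*
 * Illustrative usage on the destination of a paused postcopy migration
 * (the address is an example):
 *
 *   -> { "execute": "migrate-recover",
 *        "arguments": { "uri": "tcp:192.168.1.2:4444" } }
 *   <- { "return": {} }
 *
 * followed on the source by 'migrate' with 'resume': true (see
 * qmp_migrate() below) to reconnect over the new channel.
 */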
   2132
   2133void qmp_migrate_pause(Error **errp)
   2134{
   2135    MigrationState *ms = migrate_get_current();
   2136    MigrationIncomingState *mis = migration_incoming_get_current();
   2137    int ret;
   2138
   2139    if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
   2140        /* Source side, during postcopy */
   2141        qemu_mutex_lock(&ms->qemu_file_lock);
   2142        ret = qemu_file_shutdown(ms->to_dst_file);
   2143        qemu_mutex_unlock(&ms->qemu_file_lock);
   2144        if (ret) {
   2145            error_setg(errp, "Failed to pause source migration");
   2146        }
   2147        return;
   2148    }
   2149
   2150    if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
   2151        ret = qemu_file_shutdown(mis->from_src_file);
   2152        if (ret) {
   2153            error_setg(errp, "Failed to pause destination migration");
   2154        }
   2155        return;
   2156    }
   2157
   2158    error_setg(errp, "migrate-pause is currently only supported "
   2159               "during postcopy-active state");
   2160}
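/*
 * Illustrative usage: during POSTCOPY_ACTIVE, on either side,
 *
 *   -> { "execute": "migrate-pause" }
 *   <- { "return": {} }
 *
 * works by shutting down the underlying channel, so the stream errors
 * out and the postcopy code moves to the postcopy-paused state.
 */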
   2161
   2162bool migration_is_blocked(Error **errp)
   2163{
   2164    if (qemu_savevm_state_blocked(errp)) {
   2165        return true;
   2166    }
   2167
   2168    if (migration_blockers) {
   2169        error_propagate(errp, error_copy(migration_blockers->data));
   2170        return true;
   2171    }
   2172
   2173    return false;
   2174}
   2175
   2176/* Returns true if continue to migrate, or false if error detected */
   2177static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
   2178                            bool resume, Error **errp)
   2179{
   2180    Error *local_err = NULL;
   2181
   2182    if (resume) {
   2183        if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
   2184            error_setg(errp, "Cannot resume if there is no "
   2185                       "paused migration");
   2186            return false;
   2187        }
   2188
   2189        /*
   2190         * Postcopy recovery won't work well with the release-ram
   2191         * capability, since release-ram drops a page buffer as soon
   2192         * as the page is put into the send buffer.  So if a network
   2193         * failure happens, any page buffers that have already been
   2194         * sent from the source VM but have not yet reached the
   2195         * destination VM will be lost forever.  Let's refuse to let
   2196         * the client resume such a postcopy migration.  Luckily
   2197         * release-ram was designed to only be used when src and
   2198         * destination VMs are on the same host, so it should be
   2199         * fine.
   2200         */
   2201        if (migrate_release_ram()) {
   2202            error_setg(errp, "Postcopy recovery cannot work "
   2203                       "when release-ram capability is set");
   2204            return false;
   2205        }
   2206
   2207        /* This is a resume, skip init status */
   2208        return true;
   2209    }
   2210
   2211    if (migration_is_running(s->state)) {
   2212        error_setg(errp, QERR_MIGRATION_ACTIVE);
   2213        return false;
   2214    }
   2215
   2216    if (runstate_check(RUN_STATE_INMIGRATE)) {
   2217        error_setg(errp, "Guest is waiting for an incoming migration");
   2218        return false;
   2219    }
   2220
   2221    if (runstate_check(RUN_STATE_POSTMIGRATE)) {
   2222        error_setg(errp, "Can't migrate the vm that was paused due to "
   2223                   "previous migration");
   2224        return false;
   2225    }
   2226
   2227    if (migration_is_blocked(errp)) {
   2228        return false;
   2229    }
   2230
   2231    if (blk || blk_inc) {
   2232        if (migrate_colo_enabled()) {
   2233            error_setg(errp, "No disk migration is required in COLO mode");
   2234            return false;
   2235        }
   2236        if (migrate_use_block() || migrate_use_block_incremental()) {
   2237            error_setg(errp, "Command options are incompatible with "
   2238                       "current migration capabilities");
   2239            return false;
   2240        }
   2241        migrate_set_block_enabled(true, &local_err);
   2242        if (local_err) {
   2243            error_propagate(errp, local_err);
   2244            return false;
   2245        }
   2246        s->must_remove_block_options = true;
   2247    }
   2248
   2249    if (blk_inc) {
   2250        migrate_set_block_incremental(s, true);
   2251    }
   2252
   2253    migrate_init(s);
   2254    /*
   2255     * set ram_counters memory to zero for a
   2256     * new migration
   2257     */
   2258    memset(&ram_counters, 0, sizeof(ram_counters));
   2259
   2260    return true;
   2261}
   2262
   2263void qmp_migrate(const char *uri, bool has_blk, bool blk,
   2264                 bool has_inc, bool inc, bool has_detach, bool detach,
   2265                 bool has_resume, bool resume, Error **errp)
   2266{
   2267    Error *local_err = NULL;
   2268    MigrationState *s = migrate_get_current();
   2269    const char *p = NULL;
   2270
   2271    if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
   2272                         has_resume && resume, errp)) {
   2273        /* Error detected, put into errp */
   2274        return;
   2275    }
   2276
   2277    if (!(has_resume && resume)) {
   2278        if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
   2279            return;
   2280        }
   2281    }
   2282
   2283    if (strstart(uri, "tcp:", &p) ||
   2284        strstart(uri, "unix:", NULL) ||
   2285        strstart(uri, "vsock:", NULL)) {
   2286        socket_start_outgoing_migration(s, p ? p : uri, &local_err);
   2287#ifdef CONFIG_RDMA
   2288    } else if (strstart(uri, "rdma:", &p)) {
   2289        rdma_start_outgoing_migration(s, p, &local_err);
   2290#endif
   2291    } else if (strstart(uri, "exec:", &p)) {
   2292        exec_start_outgoing_migration(s, p, &local_err);
   2293    } else if (strstart(uri, "fd:", &p)) {
   2294        fd_start_outgoing_migration(s, p, &local_err);
   2295    } else {
   2296        if (!(has_resume && resume)) {
   2297            yank_unregister_instance(MIGRATION_YANK_INSTANCE);
   2298        }
   2299        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
   2300                   "a valid migration protocol");
   2301        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
   2302                          MIGRATION_STATUS_FAILED);
   2303        block_cleanup_parameters(s);
   2304        return;
   2305    }
   2306
   2307    if (local_err) {
   2308        if (!(has_resume && resume)) {
   2309            yank_unregister_instance(MIGRATION_YANK_INSTANCE);
   2310        }
   2311        migrate_fd_error(s, local_err);
   2312        error_propagate(errp, local_err);
   2313        return;
   2314    }
   2315}
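/*
 * Summary of the URI schemes dispatched above (addresses are examples):
 *
 *   tcp:host:port    socket_start_outgoing_migration()
 *   unix:/some/path  socket_start_outgoing_migration()
 *   vsock:cid:port   socket_start_outgoing_migration()
 *   rdma:host:port   rdma_start_outgoing_migration()  (CONFIG_RDMA only)
 *   exec:command     exec_start_outgoing_migration()
 *   fd:fdname        fd_start_outgoing_migration()
 *
 * e.g. from QMP:
 *
 *   -> { "execute": "migrate",
 *        "arguments": { "uri": "tcp:192.168.0.5:4444" } }
 *   <- { "return": {} }
 */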
   2316
   2317void qmp_migrate_cancel(Error **errp)
   2318{
   2319    migration_cancel();
   2320}
   2321
   2322void qmp_migrate_continue(MigrationStatus state, Error **errp)
   2323{
   2324    MigrationState *s = migrate_get_current();
   2325    if (s->state != state) {
   2326        error_setg(errp,  "Migration not in expected state: %s",
   2327                   MigrationStatus_str(s->state));
   2328        return;
   2329    }
   2330    qemu_sem_post(&s->pause_sem);
   2331}
   2332
   2333bool migrate_release_ram(void)
   2334{
   2335    MigrationState *s;
   2336
   2337    s = migrate_get_current();
   2338
   2339    return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM];
   2340}
   2341
   2342bool migrate_postcopy_ram(void)
   2343{
   2344    MigrationState *s;
   2345
   2346    s = migrate_get_current();
   2347
   2348    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM];
   2349}
   2350
   2351bool migrate_postcopy(void)
   2352{
   2353    return migrate_postcopy_ram() || migrate_dirty_bitmaps();
   2354}
   2355
   2356bool migrate_auto_converge(void)
   2357{
   2358    MigrationState *s;
   2359
   2360    s = migrate_get_current();
   2361
   2362    return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
   2363}
   2364
   2365bool migrate_zero_blocks(void)
   2366{
   2367    MigrationState *s;
   2368
   2369    s = migrate_get_current();
   2370
   2371    return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
   2372}
   2373
   2374bool migrate_postcopy_blocktime(void)
   2375{
   2376    MigrationState *s;
   2377
   2378    s = migrate_get_current();
   2379
   2380    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
   2381}
   2382
   2383bool migrate_use_compression(void)
   2384{
   2385    MigrationState *s;
   2386
   2387    s = migrate_get_current();
   2388
   2389    return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS];
   2390}
   2391
   2392int migrate_compress_level(void)
   2393{
   2394    MigrationState *s;
   2395
   2396    s = migrate_get_current();
   2397
   2398    return s->parameters.compress_level;
   2399}
   2400
   2401int migrate_compress_threads(void)
   2402{
   2403    MigrationState *s;
   2404
   2405    s = migrate_get_current();
   2406
   2407    return s->parameters.compress_threads;
   2408}
   2409
   2410int migrate_compress_wait_thread(void)
   2411{
   2412    MigrationState *s;
   2413
   2414    s = migrate_get_current();
   2415
   2416    return s->parameters.compress_wait_thread;
   2417}
   2418
   2419int migrate_decompress_threads(void)
   2420{
   2421    MigrationState *s;
   2422
   2423    s = migrate_get_current();
   2424
   2425    return s->parameters.decompress_threads;
   2426}
   2427
   2428bool migrate_dirty_bitmaps(void)
   2429{
   2430    MigrationState *s;
   2431
   2432    s = migrate_get_current();
   2433
   2434    return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
   2435}
   2436
   2437bool migrate_ignore_shared(void)
   2438{
   2439    MigrationState *s;
   2440
   2441    s = migrate_get_current();
   2442
   2443    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_IGNORE_SHARED];
   2444}
   2445
   2446bool migrate_validate_uuid(void)
   2447{
   2448    MigrationState *s;
   2449
   2450    s = migrate_get_current();
   2451
   2452    return s->enabled_capabilities[MIGRATION_CAPABILITY_VALIDATE_UUID];
   2453}
   2454
   2455bool migrate_use_events(void)
   2456{
   2457    MigrationState *s;
   2458
   2459    s = migrate_get_current();
   2460
   2461    return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
   2462}
   2463
   2464bool migrate_use_multifd(void)
   2465{
   2466    MigrationState *s;
   2467
   2468    s = migrate_get_current();
   2469
   2470    return s->enabled_capabilities[MIGRATION_CAPABILITY_MULTIFD];
   2471}
   2472
   2473bool migrate_pause_before_switchover(void)
   2474{
   2475    MigrationState *s;
   2476
   2477    s = migrate_get_current();
   2478
   2479    return s->enabled_capabilities[
   2480        MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER];
   2481}
   2482
   2483int migrate_multifd_channels(void)
   2484{
   2485    MigrationState *s;
   2486
   2487    s = migrate_get_current();
   2488
   2489    return s->parameters.multifd_channels;
   2490}
   2491
   2492MultiFDCompression migrate_multifd_compression(void)
   2493{
   2494    MigrationState *s;
   2495
   2496    s = migrate_get_current();
   2497
   2498    return s->parameters.multifd_compression;
   2499}
   2500
   2501int migrate_multifd_zlib_level(void)
   2502{
   2503    MigrationState *s;
   2504
   2505    s = migrate_get_current();
   2506
   2507    return s->parameters.multifd_zlib_level;
   2508}
   2509
   2510int migrate_multifd_zstd_level(void)
   2511{
   2512    MigrationState *s;
   2513
   2514    s = migrate_get_current();
   2515
   2516    return s->parameters.multifd_zstd_level;
   2517}
   2518
   2519int migrate_use_xbzrle(void)
   2520{
   2521    MigrationState *s;
   2522
   2523    s = migrate_get_current();
   2524
   2525    return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
   2526}
   2527
   2528uint64_t migrate_xbzrle_cache_size(void)
   2529{
   2530    MigrationState *s;
   2531
   2532    s = migrate_get_current();
   2533
   2534    return s->parameters.xbzrle_cache_size;
   2535}
   2536
   2537static int64_t migrate_max_postcopy_bandwidth(void)
   2538{
   2539    MigrationState *s;
   2540
   2541    s = migrate_get_current();
   2542
   2543    return s->parameters.max_postcopy_bandwidth;
   2544}
   2545
   2546bool migrate_use_block(void)
   2547{
   2548    MigrationState *s;
   2549
   2550    s = migrate_get_current();
   2551
   2552    return s->enabled_capabilities[MIGRATION_CAPABILITY_BLOCK];
   2553}
   2554
   2555bool migrate_use_return_path(void)
   2556{
   2557    MigrationState *s;
   2558
   2559    s = migrate_get_current();
   2560
   2561    return s->enabled_capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
   2562}
   2563
   2564bool migrate_use_block_incremental(void)
   2565{
   2566    MigrationState *s;
   2567
   2568    s = migrate_get_current();
   2569
   2570    return s->parameters.block_incremental;
   2571}
   2572
   2573bool migrate_background_snapshot(void)
   2574{
   2575    MigrationState *s;
   2576
   2577    s = migrate_get_current();
   2578
   2579    return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT];
   2580}
   2581
   2582/* migration thread support */
   2583/*
   2584 * Something bad happened to the RP stream; mark an error.
   2585 * The caller shall print or trace something to indicate why.
   2586 */
   2587static void mark_source_rp_bad(MigrationState *s)
   2588{
   2589    s->rp_state.error = true;
   2590}
   2591
   2592static struct rp_cmd_args {
   2593    ssize_t     len; /* -1 = variable */
   2594    const char *name;
   2595} rp_cmd_args[] = {
   2596    [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
   2597    [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
   2598    [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
   2599    [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
   2600    [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
   2601    [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
   2602    [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
   2603    [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
   2604};
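/*
 * Return-path wire format, as consumed by source_return_path_thread()
 * below: every message is
 *
 *   be16 header_type   one of the MIG_RP_MSG_* values above
 *   be16 header_len    payload length in bytes (fixed per type; -1 in
 *                      the table above means variable-length)
 *   u8   payload[header_len]
 *
 * e.g. PONG carries one be32 value (len 4), and REQ_PAGES carries a be64
 * start address plus a be32 length (len 12).
 */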
   2605
   2606/*
   2607 * Process a request for pages received on the return path.
   2608 * We're allowed to send more than requested (e.g. to round to our page size)
   2609 * and we don't need to send pages that have already been sent.
   2610 */
   2611static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
   2612                                       ram_addr_t start, size_t len)
   2613{
   2614    long our_host_ps = qemu_real_host_page_size;
   2615
   2616    trace_migrate_handle_rp_req_pages(rbname, start, len);
   2617
   2618    /*
   2619     * Since we currently insist on matching page sizes, just sanity check
   2620     * we're being asked for whole host pages.
   2621     */
   2622    if (start & (our_host_ps - 1) ||
   2623       (len & (our_host_ps - 1))) {
   2624        error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
   2625                     " len: %zd", __func__, start, len);
   2626        mark_source_rp_bad(ms);
   2627        return;
   2628    }
   2629
   2630    if (ram_save_queue_pages(rbname, start, len)) {
   2631        mark_source_rp_bad(ms);
   2632    }
   2633}
   2634
   2635/* Return true to retry, false to quit */
   2636static bool postcopy_pause_return_path_thread(MigrationState *s)
   2637{
   2638    trace_postcopy_pause_return_path();
   2639
   2640    qemu_sem_wait(&s->postcopy_pause_rp_sem);
   2641
   2642    trace_postcopy_pause_return_path_continued();
   2643
   2644    return true;
   2645}
   2646
   2647static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
   2648{
   2649    RAMBlock *block = qemu_ram_block_by_name(block_name);
   2650
   2651    if (!block) {
   2652        error_report("%s: invalid block name '%s'", __func__, block_name);
   2653        return -EINVAL;
   2654    }
   2655
   2656    /* Fetch the received bitmap and refresh the dirty bitmap */
   2657    return ram_dirty_bitmap_reload(s, block);
   2658}
   2659
   2660static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
   2661{
   2662    trace_source_return_path_thread_resume_ack(value);
   2663
   2664    if (value != MIGRATION_RESUME_ACK_VALUE) {
   2665        error_report("%s: illegal resume_ack value %"PRIu32,
   2666                     __func__, value);
   2667        return -1;
   2668    }
   2669
   2670    /* Now both sides are active. */
   2671    migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
   2672                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
   2673
   2674    /* Notify send thread that time to continue send pages */
   2675    qemu_sem_post(&s->rp_state.rp_sem);
   2676
   2677    return 0;
   2678}
   2679
   2680/* Release ms->rp_state.from_dst_file in a safe way */
   2681static void migration_release_from_dst_file(MigrationState *ms)
   2682{
   2683    QEMUFile *file;
   2684
   2685    WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
   2686        /*
   2687         * Reset the from_dst_file pointer first before releasing it, as we
   2688         * can't block within the lock section.
   2689         */
   2690        file = ms->rp_state.from_dst_file;
   2691        ms->rp_state.from_dst_file = NULL;
   2692    }
   2693
   2694    qemu_fclose(file);
   2695}
   2696
   2697/*
   2698 * Handles messages sent on the return path towards the source VM.
   2699 *
   2700 */
   2701static void *source_return_path_thread(void *opaque)
   2702{
   2703    MigrationState *ms = opaque;
   2704    QEMUFile *rp = ms->rp_state.from_dst_file;
   2705    uint16_t header_len, header_type;
   2706    uint8_t buf[512];
   2707    uint32_t tmp32, sibling_error;
   2708    ram_addr_t start = 0; /* =0 to silence warning */
   2709    size_t len = 0, expected_len;
   2710    int res;
   2711
   2712    trace_source_return_path_thread_entry();
   2713    rcu_register_thread();
   2714
   2715retry:
   2716    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
   2717           migration_is_setup_or_active(ms->state)) {
   2718        trace_source_return_path_thread_loop_top();
   2719        header_type = qemu_get_be16(rp);
   2720        header_len = qemu_get_be16(rp);
   2721
   2722        if (qemu_file_get_error(rp)) {
   2723            mark_source_rp_bad(ms);
   2724            goto out;
   2725        }
   2726
   2727        if (header_type >= MIG_RP_MSG_MAX ||
   2728            header_type == MIG_RP_MSG_INVALID) {
   2729            error_report("RP: Received invalid message 0x%04x length 0x%04x",
   2730                         header_type, header_len);
   2731            mark_source_rp_bad(ms);
   2732            goto out;
   2733        }
   2734
   2735        if ((rp_cmd_args[header_type].len != -1 &&
   2736            header_len != rp_cmd_args[header_type].len) ||
   2737            header_len > sizeof(buf)) {
   2738            error_report("RP: Received '%s' message (0x%04x) with"
   2739                         " incorrect length %d expecting %zu",
   2740                         rp_cmd_args[header_type].name, header_type, header_len,
   2741                         (size_t)rp_cmd_args[header_type].len);
   2742            mark_source_rp_bad(ms);
   2743            goto out;
   2744        }
   2745
   2746        /* We know we've got a valid header by this point */
   2747        res = qemu_get_buffer(rp, buf, header_len);
   2748        if (res != header_len) {
   2749            error_report("RP: Failed reading data for message 0x%04x"
   2750                         " read %d expected %d",
   2751                         header_type, res, header_len);
   2752            mark_source_rp_bad(ms);
   2753            goto out;
   2754        }
   2755
   2756        /* OK, we have the message and the data */
   2757        switch (header_type) {
   2758        case MIG_RP_MSG_SHUT:
   2759            sibling_error = ldl_be_p(buf);
   2760            trace_source_return_path_thread_shut(sibling_error);
   2761            if (sibling_error) {
   2762                error_report("RP: Sibling indicated error %d", sibling_error);
   2763                mark_source_rp_bad(ms);
   2764            }
   2765            /*
   2766             * We'll let the main thread deal with closing the RP
   2767             * we could do a shutdown(2) on it, but we're the only user
   2768             * anyway, so there's nothing gained.
   2769             */
   2770            goto out;
   2771
   2772        case MIG_RP_MSG_PONG:
   2773            tmp32 = ldl_be_p(buf);
   2774            trace_source_return_path_thread_pong(tmp32);
   2775            break;
   2776
   2777        case MIG_RP_MSG_REQ_PAGES:
   2778            start = ldq_be_p(buf);
   2779            len = ldl_be_p(buf + 8);
   2780            migrate_handle_rp_req_pages(ms, NULL, start, len);
   2781            break;
   2782
   2783        case MIG_RP_MSG_REQ_PAGES_ID:
   2784            expected_len = 12 + 1; /* header + termination */
   2785
   2786            if (header_len >= expected_len) {
   2787                start = ldq_be_p(buf);
   2788                len = ldl_be_p(buf + 8);
   2789                /* Now we expect an idstr */
   2790                tmp32 = buf[12]; /* Length of the following idstr */
   2791                buf[13 + tmp32] = '\0';
   2792                expected_len += tmp32;
   2793            }
   2794            if (header_len != expected_len) {
   2795                error_report("RP: Req_Page_id with length %d expecting %zd",
   2796                             header_len, expected_len);
   2797                mark_source_rp_bad(ms);
   2798                goto out;
   2799            }
   2800            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
   2801            break;
   2802
   2803        case MIG_RP_MSG_RECV_BITMAP:
   2804            if (header_len < 1) {
   2805                error_report("%s: missing block name", __func__);
   2806                mark_source_rp_bad(ms);
   2807                goto out;
   2808            }
   2809            /* Format: len (1B) + idstr (<255B). This ends the idstr. */
   2810            buf[buf[0] + 1] = '\0';
   2811            if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
   2812                mark_source_rp_bad(ms);
   2813                goto out;
   2814            }
   2815            break;
   2816
   2817        case MIG_RP_MSG_RESUME_ACK:
   2818            tmp32 = ldl_be_p(buf);
   2819            if (migrate_handle_rp_resume_ack(ms, tmp32)) {
   2820                mark_source_rp_bad(ms);
   2821                goto out;
   2822            }
   2823            break;
   2824
   2825        default:
   2826            break;
   2827        }
   2828    }
   2829
   2830out:
   2831    res = qemu_file_get_error(rp);
   2832    if (res) {
   2833        if (res == -EIO && migration_in_postcopy()) {
   2834            /*
   2835             * Maybe there is something we can do: it looks like the
   2836             * network went down, so pause and wait for a recovery.
   2837             */
   2838            migration_release_from_dst_file(ms);
   2839            rp = NULL;
   2840            if (postcopy_pause_return_path_thread(ms)) {
   2841                /*
   2842                 * Reload rp, reset the rest.  Referencing it is safe since
   2843                 * it's reset only by us above, or when migration completes
   2844                 */
   2845                rp = ms->rp_state.from_dst_file;
   2846                ms->rp_state.error = false;
   2847                goto retry;
   2848            }
   2849        }
   2850
   2851        trace_source_return_path_thread_bad_end();
   2852        mark_source_rp_bad(ms);
   2853    }
   2854
   2855    trace_source_return_path_thread_end();
   2856    migration_release_from_dst_file(ms);
   2857    rcu_unregister_thread();
   2858    return NULL;
   2859}
   2860
   2861static int open_return_path_on_source(MigrationState *ms,
   2862                                      bool create_thread)
   2863{
   2864    ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
   2865    if (!ms->rp_state.from_dst_file) {
   2866        return -1;
   2867    }
   2868
   2869    trace_open_return_path_on_source();
   2870
   2871    if (!create_thread) {
   2872        /* We're done */
   2873        return 0;
   2874    }
   2875
   2876    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
   2877                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
   2878    ms->rp_state.rp_thread_created = true;
   2879
   2880    trace_open_return_path_on_source_continue();
   2881
   2882    return 0;
   2883}
   2884
   2885/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
   2886static int await_return_path_close_on_source(MigrationState *ms)
   2887{
   2888    /*
   2889     * If this is a normal exit then the destination will send a SHUT and the
   2890     * rp_thread will exit; however, if there's an error we need to cause
   2891     * it to exit ourselves.
   2892     */
   2893    if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
   2894        /*
   2895         * shutdown(2), if we have it, will cause it to unblock if it's stuck
   2896         * waiting for the destination.
   2897         */
   2898        qemu_file_shutdown(ms->rp_state.from_dst_file);
   2899        mark_source_rp_bad(ms);
   2900    }
   2901    trace_await_return_path_close_on_source_joining();
   2902    qemu_thread_join(&ms->rp_state.rp_thread);
   2903    ms->rp_state.rp_thread_created = false;
   2904    trace_await_return_path_close_on_source_close();
   2905    return ms->rp_state.error;
   2906}
   2907
   2908/*
   2909 * Switch from normal iteration to postcopy
   2910 * Returns non-0 on error
   2911 */
   2912static int postcopy_start(MigrationState *ms)
   2913{
   2914    int ret;
   2915    QIOChannelBuffer *bioc;
   2916    QEMUFile *fb;
   2917    int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
   2918    int64_t bandwidth = migrate_max_postcopy_bandwidth();
   2919    bool restart_block = false;
   2920    int cur_state = MIGRATION_STATUS_ACTIVE;
   2921    if (!migrate_pause_before_switchover()) {
   2922        migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
   2923                          MIGRATION_STATUS_POSTCOPY_ACTIVE);
   2924    }
   2925
   2926    trace_postcopy_start();
   2927    qemu_mutex_lock_iothread();
   2928    trace_postcopy_start_set_run();
   2929
   2930    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
   2931    global_state_store();
   2932    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
   2933    if (ret < 0) {
   2934        goto fail;
   2935    }
   2936
   2937    ret = migration_maybe_pause(ms, &cur_state,
   2938                                MIGRATION_STATUS_POSTCOPY_ACTIVE);
   2939    if (ret < 0) {
   2940        goto fail;
   2941    }
   2942
   2943    ret = bdrv_inactivate_all();
   2944    if (ret < 0) {
   2945        goto fail;
   2946    }
   2947    restart_block = true;
   2948
   2949    /*
   2950     * Cause any non-postcopiable, but iterative devices to
   2951     * send out their final data.
   2952     */
   2953    qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
   2954
   2955    /*
   2956     * In the 'finish migrate' state and with the io-lock held,
   2957     * everything should be quiet, but we've potentially still got
   2958     * dirty pages and we need to tell the destination to throw away
   2959     * any pages it has already received that are now dirty.
   2960     */
   2961    if (migrate_postcopy_ram()) {
   2962        if (ram_postcopy_send_discard_bitmap(ms)) {
   2963            error_report("postcopy send discard bitmap failed");
   2964            goto fail;
   2965        }
   2966    }
   2967
   2968    /*
   2969     * Send the rest of the state; note that devices doing postcopy
   2970     * will notice we're in POSTCOPY_ACTIVE and will not actually
   2971     * wrap their state up here.
   2972     */
   2973    /* 0 max-postcopy-bandwidth means unlimited */
   2974    if (!bandwidth) {
   2975        qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
   2976    } else {
   2977        qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / XFER_LIMIT_RATIO);
   2978    }
   2979    if (migrate_postcopy_ram()) {
   2980        /* Ping just for debugging, helps line traces up */
   2981        qemu_savevm_send_ping(ms->to_dst_file, 2);
   2982    }
   2983
   2984    /*
   2985     * While loading the device state we may trigger page transfer
   2986     * requests and the fd must be free to process those, and thus
   2987     * the destination must read the whole device state off the fd before
   2988     * it starts processing it.  Unfortunately the ad-hoc migration format
   2989     * doesn't allow the destination to know the size to read without fully
   2990     * parsing it through each device's load-state code (especially the open
   2991     * coded devices that use get/put).
   2992     * So we wrap the device state up in a package with a length at the start;
   2993     * to do this we use a qemu_buf to hold the whole of the device state.
   2994     */
   2995    bioc = qio_channel_buffer_new(4096);
   2996    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
   2997    fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
   2998    object_unref(OBJECT(bioc));
   2999
   3000    /*
   3001     * Make sure the receiver can get incoming pages before we send the rest
   3002     * of the state
   3003     */
   3004    qemu_savevm_send_postcopy_listen(fb);
   3005
   3006    qemu_savevm_state_complete_precopy(fb, false, false);
   3007    if (migrate_postcopy_ram()) {
   3008        qemu_savevm_send_ping(fb, 3);
   3009    }
   3010
   3011    qemu_savevm_send_postcopy_run(fb);
   3012
   3013    /* <><> end of stuff going into the package */
   3014
   3015    /* Last point of recovery; as soon as we send the package the destination
   3016     * can open devices and potentially start running.
   3017     * Let's just check again that we've not got any errors.
   3018     */
   3019    ret = qemu_file_get_error(ms->to_dst_file);
   3020    if (ret) {
   3021        error_report("postcopy_start: Migration stream errored (pre package)");
   3022        goto fail_closefb;
   3023    }
   3024
   3025    restart_block = false;
   3026
   3027    /* Now send that blob */
   3028    if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
   3029        goto fail_closefb;
   3030    }
   3031    qemu_fclose(fb);
   3032
   3033    /* Send a notification to give anything that needs to act at the
   3034     * transition to postcopy, and after the device state, a chance to do so;
   3035     * in particular spice needs to trigger a transition now.
   3036     */
   3037    ms->postcopy_after_devices = true;
   3038    notifier_list_notify(&migration_state_notifiers, ms);
   3039
   3040    ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
   3041
   3042    qemu_mutex_unlock_iothread();
   3043
   3044    if (migrate_postcopy_ram()) {
   3045        /*
   3046         * Although this ping is just for debug, it could potentially be
   3047         * used for getting a better measurement of downtime at the source.
   3048         */
   3049        qemu_savevm_send_ping(ms->to_dst_file, 4);
   3050    }
   3051
   3052    if (migrate_release_ram()) {
   3053        ram_postcopy_migrated_memory_release(ms);
   3054    }
   3055
   3056    ret = qemu_file_get_error(ms->to_dst_file);
   3057    if (ret) {
   3058        error_report("postcopy_start: Migration stream errored");
   3059        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
   3060                              MIGRATION_STATUS_FAILED);
   3061    }
   3062
   3063    return ret;
   3064
   3065fail_closefb:
   3066    qemu_fclose(fb);
   3067fail:
   3068    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
   3069                          MIGRATION_STATUS_FAILED);
   3070    if (restart_block) {
   3071        /* A failure happened early enough that we know the destination hasn't
   3072         * accessed block devices, so we're safe to recover.
   3073         */
   3074        Error *local_err = NULL;
   3075
   3076        bdrv_invalidate_cache_all(&local_err);
   3077        if (local_err) {
   3078            error_report_err(local_err);
   3079        }
   3080    }
   3081    qemu_mutex_unlock_iothread();
   3082    return -1;
   3083}
   3084
   3085/**
   3086 * migration_maybe_pause: Pause if required to by
   3087 * migrate_pause_before_switchover(); called with the iothread locked.
   3088 * Returns: 0 on success
   3089 */
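       /*
        * Usage sketch (editor's note, hedged): with the
        * "pause-before-switchover" capability enabled, the migration sits in
        * PRE_SWITCHOVER until the user continues it over QMP, e.g.:
        *
        *   { "execute": "migrate-continue",
        *     "arguments": { "state": "pre-switchover" } }
        */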
   3090static int migration_maybe_pause(MigrationState *s,
   3091                                 int *current_active_state,
   3092                                 int new_state)
   3093{
   3094    if (!migrate_pause_before_switchover()) {
   3095        return 0;
   3096    }
   3097
   3098    /* Since leaving this state is not atomic with posting the semaphore
   3099     * it's possible that someone could have issued multiple migrate_continue
   3100     * commands and the semaphore is incorrectly positive at this point;
   3101     * the docs say it's undefined to reinit a semaphore that's already
   3102     * init'd, so use timedwait to eat up any existing posts.
   3103     */
   3104    while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
   3105        /* This block intentionally left blank */
   3106    }
   3107
   3108    /*
   3109     * If the migration is cancelled when it is in the completion phase,
   3110     * the migration state is set to MIGRATION_STATUS_CANCELLING.
   3111     * In that case we don't need to wait on the semaphore; otherwise we
   3112     * would always wait for the 'pause_sem' semaphore.
   3113     */
   3114    if (s->state != MIGRATION_STATUS_CANCELLING) {
   3115        qemu_mutex_unlock_iothread();
   3116        migrate_set_state(&s->state, *current_active_state,
   3117                          MIGRATION_STATUS_PRE_SWITCHOVER);
   3118        qemu_sem_wait(&s->pause_sem);
   3119        migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
   3120                          new_state);
   3121        *current_active_state = new_state;
   3122        qemu_mutex_lock_iothread();
   3123    }
   3124
   3125    return s->state == new_state ? 0 : -EINVAL;
   3126}
   3127
   3128/**
   3129 * migration_completion: Used by migration_thread when there's not much left.
   3130 *   The caller 'breaks' the loop when this returns.
   3131 *
   3132 * @s: Current migration state
   3133 */
   3134static void migration_completion(MigrationState *s)
   3135{
   3136    int ret;
   3137    int current_active_state = s->state;
   3138
   3139    if (s->state == MIGRATION_STATUS_ACTIVE) {
   3140        qemu_mutex_lock_iothread();
   3141        s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
   3142        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
   3143        s->vm_was_running = runstate_is_running();
   3144        ret = global_state_store();
   3145
   3146        if (!ret) {
   3147            bool inactivate = !migrate_colo_enabled();
   3148            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
   3149            trace_migration_completion_vm_stop(ret);
   3150            if (ret >= 0) {
   3151                ret = migration_maybe_pause(s, &current_active_state,
   3152                                            MIGRATION_STATUS_DEVICE);
   3153            }
   3154            if (ret >= 0) {
   3155                qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
   3156                ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
   3157                                                         inactivate);
   3158            }
   3159            if (inactivate && ret >= 0) {
   3160                s->block_inactive = true;
   3161            }
   3162        }
   3163        qemu_mutex_unlock_iothread();
   3164
   3165        if (ret < 0) {
   3166            goto fail;
   3167        }
   3168    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
   3169        trace_migration_completion_postcopy_end();
   3170
   3171        qemu_mutex_lock_iothread();
   3172        qemu_savevm_state_complete_postcopy(s->to_dst_file);
   3173        qemu_mutex_unlock_iothread();
   3174
   3175        trace_migration_completion_postcopy_end_after_complete();
   3176    } else if (s->state == MIGRATION_STATUS_CANCELLING) {
   3177        goto fail;
   3178    }
   3179
   3180    /*
   3181     * If rp was opened we must clean up the thread before
   3182     * cleaning everything else up (since if there are no failures
   3183     * it will wait for the destination to send its status in
   3184     * a SHUT command).
   3185     */
   3186    if (s->rp_state.rp_thread_created) {
   3187        int rp_error;
   3188        trace_migration_return_path_end_before();
   3189        rp_error = await_return_path_close_on_source(s);
   3190        trace_migration_return_path_end_after(rp_error);
   3191        if (rp_error) {
   3192            goto fail_invalidate;
   3193        }
   3194    }
   3195
   3196    if (qemu_file_get_error(s->to_dst_file)) {
   3197        trace_migration_completion_file_err();
   3198        goto fail_invalidate;
   3199    }
   3200
   3201    if (!migrate_colo_enabled()) {
   3202        migrate_set_state(&s->state, current_active_state,
   3203                          MIGRATION_STATUS_COMPLETED);
   3204    }
   3205
   3206    return;
   3207
   3208fail_invalidate:
   3209    /* If not doing postcopy, vm_start() will be called: let's regain
   3210     * control of the images.
   3211     */
   3212    if (s->state == MIGRATION_STATUS_ACTIVE ||
   3213        s->state == MIGRATION_STATUS_DEVICE) {
   3214        Error *local_err = NULL;
   3215
   3216        qemu_mutex_lock_iothread();
   3217        bdrv_invalidate_cache_all(&local_err);
   3218        if (local_err) {
   3219            error_report_err(local_err);
   3220        } else {
   3221            s->block_inactive = false;
   3222        }
   3223        qemu_mutex_unlock_iothread();
   3224    }
   3225
   3226fail:
   3227    migrate_set_state(&s->state, current_active_state,
   3228                      MIGRATION_STATUS_FAILED);
   3229}
   3230
   3231/**
   3232 * bg_migration_completion: Used by bg_migration_thread after all the
   3233 *   RAM has been saved. The caller 'breaks' the loop when this returns.
   3234 *
   3235 * @s: Current migration state
   3236 */
   3237static void bg_migration_completion(MigrationState *s)
   3238{
   3239    int current_active_state = s->state;
   3240
   3241    /*
   3242     * Stop tracking RAM writes - un-protect memory, un-register UFFD
   3243     * memory ranges, flush kernel wait queues and wake up threads
   3244     * waiting for write fault to be resolved.
   3245     */
   3246    ram_write_tracking_stop();
   3247
   3248    if (s->state == MIGRATION_STATUS_ACTIVE) {
   3249        /*
   3250         * By this moment we have RAM content saved into the migration stream.
   3251         * The next step is to flush the non-RAM content (device state)
   3252         * right after the ram content. The device state has been stored into
   3253         * the temporary buffer before RAM saving started.
   3254         */
   3255        qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
   3256        qemu_fflush(s->to_dst_file);
   3257    } else if (s->state == MIGRATION_STATUS_CANCELLING) {
   3258        goto fail;
   3259    }
   3260
   3261    if (qemu_file_get_error(s->to_dst_file)) {
   3262        trace_migration_completion_file_err();
   3263        goto fail;
   3264    }
   3265
   3266    migrate_set_state(&s->state, current_active_state,
   3267                      MIGRATION_STATUS_COMPLETED);
   3268    return;
   3269
   3270fail:
   3271    migrate_set_state(&s->state, current_active_state,
   3272                      MIGRATION_STATUS_FAILED);
   3273}
   3274
   3275bool migrate_colo_enabled(void)
   3276{
   3277    MigrationState *s = migrate_get_current();
   3278    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
   3279}
   3280
   3281typedef enum MigThrError {
   3282    /* No error detected */
   3283    MIG_THR_ERR_NONE = 0,
   3284    /* Detected error, but resumed successfully */
   3285    MIG_THR_ERR_RECOVERED = 1,
   3286    /* Detected fatal error, need to exit */
   3287    MIG_THR_ERR_FATAL = 2,
   3288} MigThrError;
   3289
   3290static int postcopy_resume_handshake(MigrationState *s)
   3291{
   3292    qemu_savevm_send_postcopy_resume(s->to_dst_file);
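           /*
            * Editor's note (a reading of the surrounding code, not a
            * guarantee): the wait below is expected to be ended by the
            * return-path thread, which flips the state to POSTCOPY_ACTIVE
            * and posts rp_sem once the destination acknowledges the resume.
            */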
   3293
   3294    while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
   3295        qemu_sem_wait(&s->rp_state.rp_sem);
   3296    }
   3297
   3298    if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
   3299        return 0;
   3300    }
   3301
   3302    return -1;
   3303}
   3304
   3305/* Return zero if success, or <0 for error */
   3306static int postcopy_do_resume(MigrationState *s)
   3307{
   3308    int ret;
   3309
   3310    /*
   3311     * Call all the resume_prepare() hooks, so that modules can be
   3312     * ready for the migration resume.
   3313     */
   3314    ret = qemu_savevm_state_resume_prepare(s);
   3315    if (ret) {
   3316        error_report("%s: resume_prepare() failure detected: %d",
   3317                     __func__, ret);
   3318        return ret;
   3319    }
   3320
   3321    /*
   3322     * Last handshake with destination on the resume (destination will
   3323     * switch to postcopy-active afterwards)
   3324     */
   3325    ret = postcopy_resume_handshake(s);
   3326    if (ret) {
   3327        error_report("%s: handshake failed: %d", __func__, ret);
   3328        return ret;
   3329    }
   3330
   3331    return 0;
   3332}
   3333
   3334/*
   3335 * We don't return until we are in a safe state to continue current
   3336 * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
   3337 * MIG_THR_ERR_FATAL if an unrecoverable failure happened.
   3338 */
   3339static MigThrError postcopy_pause(MigrationState *s)
   3340{
   3341    assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
   3342
   3343    while (true) {
   3344        QEMUFile *file;
   3345
   3346        /*
   3347         * Current channel is possibly broken. Release it.  Note that this is
   3348         * guaranteed even without lock because to_dst_file should only be
   3349         * modified by the migration thread.  That also guarantees that the
   3350         * unregister of yank is safe too without the lock.  It should be safe
   3351         * even to be within the qemu_file_lock, but we didn't do that to avoid
   3352         * taking more mutex (yank_lock) within qemu_file_lock.  TL;DR: we make
   3353         * the qemu_file_lock critical section as small as possible.
   3354         */
   3355        assert(s->to_dst_file);
   3356        migration_ioc_unregister_yank_from_file(s->to_dst_file);
   3357        qemu_mutex_lock(&s->qemu_file_lock);
   3358        file = s->to_dst_file;
   3359        s->to_dst_file = NULL;
   3360        qemu_mutex_unlock(&s->qemu_file_lock);
   3361
   3362        qemu_file_shutdown(file);
   3363        qemu_fclose(file);
   3364
   3365        migrate_set_state(&s->state, s->state,
   3366                          MIGRATION_STATUS_POSTCOPY_PAUSED);
   3367
   3368        error_report("Detected IO failure for postcopy. "
   3369                     "Migration paused.");
   3370
   3371        /*
   3372         * We wait until things are fixed up. Then someone will set the
   3373         * status back for us.
   3374         */
   3375        while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
   3376            qemu_sem_wait(&s->postcopy_pause_sem);
   3377        }
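           /*
            * Recovery sketch (grounded in migrate_fd_connect() below): once
            * the user establishes a fresh channel, migrate_fd_connect()
            * moves us from POSTCOPY_PAUSED to POSTCOPY_RECOVER and posts
            * postcopy_pause_sem, releasing the wait above.
            */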
   3378
   3379        if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
   3380            /* Woken up by a recover procedure. Give it a shot */
   3381
   3382            /*
   3383             * Firstly, let's wake up the return path now, with a new
   3384             * return path channel.
   3385             */
   3386            qemu_sem_post(&s->postcopy_pause_rp_sem);
   3387
   3388            /* Do the resume logic */
   3389            if (postcopy_do_resume(s) == 0) {
   3390                /* Let's continue! */
   3391                trace_postcopy_pause_continued();
   3392                return MIG_THR_ERR_RECOVERED;
   3393            } else {
   3394                /*
   3395                 * Something went wrong during the recovery; let's
   3396                 * pause again. Pausing is always better than throwing
   3397                 * data away.
   3398                 */
   3399                continue;
   3400            }
   3401        } else {
   3402            /* This is not right... Time to quit. */
   3403            return MIG_THR_ERR_FATAL;
   3404        }
   3405    }
   3406}
   3407
   3408static MigThrError migration_detect_error(MigrationState *s)
   3409{
   3410    int ret;
   3411    int state = s->state;
   3412    Error *local_error = NULL;
   3413
   3414    if (state == MIGRATION_STATUS_CANCELLING ||
   3415        state == MIGRATION_STATUS_CANCELLED) {
   3416        /* End the migration, but don't set the state to failed */
   3417        return MIG_THR_ERR_FATAL;
   3418    }
   3419
   3420    /* Try to detect any file errors */
   3421    ret = qemu_file_get_error_obj(s->to_dst_file, &local_error);
   3422    if (!ret) {
   3423        /* Everything is fine */
   3424        assert(!local_error);
   3425        return MIG_THR_ERR_NONE;
   3426    }
   3427
   3428    if (local_error) {
   3429        migrate_set_error(s, local_error);
   3430        error_free(local_error);
   3431    }
   3432
   3433    if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
   3434        /*
   3435         * For postcopy, we allow the network to be down for a
   3436         * while. After that, it can be continued by a
   3437         * recovery phase.
   3438         */
   3439        return postcopy_pause(s);
   3440    } else {
   3441        /*
   3442         * For precopy (or postcopy with an error outside IO), we fail
   3443         * immediately.
   3444         */
   3445        migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
   3446        trace_migration_thread_file_err();
   3447
   3448        /* Time to stop the migration, now. */
   3449        return MIG_THR_ERR_FATAL;
   3450    }
   3451}
   3452
   3453/* How many bytes have we transferred since the beginning of the migration */
   3454static uint64_t migration_total_bytes(MigrationState *s)
   3455{
   3456    return qemu_ftell(s->to_dst_file) + ram_counters.multifd_bytes;
   3457}
   3458
   3459static void migration_calculate_complete(MigrationState *s)
   3460{
   3461    uint64_t bytes = migration_total_bytes(s);
   3462    int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
   3463    int64_t transfer_time;
   3464
   3465    s->total_time = end_time - s->start_time;
   3466    if (!s->downtime) {
   3467        /*
   3468         * It's still not set, so we are precopy migration.  For
   3469         * postcopy, downtime is calculated during postcopy_start().
   3470         */
   3471        s->downtime = end_time - s->downtime_start;
   3472    }
   3473
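           /*
            * Units check (editor's note): bytes * 8 is bits and
            * transfer_time is in ms, so bits / ms / 1000 == megabits per
            * second.  E.g. 1 GiB in 10 s:
            * 8589934592 bits / 10000 ms / 1000 ~= 859 Mbps.
            */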
   3474    transfer_time = s->total_time - s->setup_time;
   3475    if (transfer_time) {
   3476        s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
   3477    }
   3478}
   3479
   3480static void update_iteration_initial_status(MigrationState *s)
   3481{
   3482    /*
   3483     * Update these three fields at the same time to avoid mismatched info
   3484     * leading to wrong speed calculations.
   3485     */
   3486    s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
   3487    s->iteration_initial_bytes = migration_total_bytes(s);
   3488    s->iteration_initial_pages = ram_get_total_transferred_pages();
   3489}
   3490
   3491static void migration_update_counters(MigrationState *s,
   3492                                      int64_t current_time)
   3493{
   3494    uint64_t transferred, transferred_pages, time_spent;
   3495    uint64_t current_bytes; /* bytes transferred since the beginning */
   3496    double bandwidth;
   3497
   3498    if (current_time < s->iteration_start_time + BUFFER_DELAY) {
   3499        return;
   3500    }
   3501
   3502    current_bytes = migration_total_bytes(s);
   3503    transferred = current_bytes - s->iteration_initial_bytes;
   3504    time_spent = current_time - s->iteration_start_time;
   3505    bandwidth = (double)transferred / time_spent;
   3506    s->threshold_size = bandwidth * s->parameters.downtime_limit;
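           /*
            * Editor's note: 'bandwidth' is in bytes/ms and downtime_limit in
            * ms, so threshold_size is how many bytes we could still push
            * within the allowed downtime.  E.g. ~100 MiB/s (~104858 bytes/ms)
            * with a 300 ms limit gives a threshold of ~30 MiB.
            */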
   3507
   3508    s->mbps = (((double) transferred * 8.0) /
   3509               ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
   3510
   3511    transferred_pages = ram_get_total_transferred_pages() -
   3512                            s->iteration_initial_pages;
   3513    s->pages_per_second = (double) transferred_pages /
   3514                             (((double) time_spent / 1000.0));
   3515
   3516    /*
   3517     * if we haven't sent anything, we don't want to
   3518     * recalculate. 10000 is a small enough number for our purposes
   3519     */
   3520    if (ram_counters.dirty_pages_rate && transferred > 10000) {
   3521        s->expected_downtime = ram_counters.remaining / bandwidth;
   3522    }
   3523
   3524    qemu_file_reset_rate_limit(s->to_dst_file);
   3525
   3526    update_iteration_initial_status(s);
   3527
   3528    trace_migrate_transferred(transferred, time_spent,
   3529                              bandwidth, s->threshold_size);
   3530}
   3531
   3532/* Migration thread iteration status */
   3533typedef enum {
   3534    MIG_ITERATE_RESUME,         /* Resume current iteration */
   3535    MIG_ITERATE_SKIP,           /* Skip current iteration */
   3536    MIG_ITERATE_BREAK,          /* Break the loop */
   3537} MigIterateState;
   3538
   3539/*
   3540 * Returns the iteration state: MIG_ITERATE_RESUME to continue directly
   3541 * to the next iteration, MIG_ITERATE_SKIP or MIG_ITERATE_BREAK otherwise.
   3542 */
   3543static MigIterateState migration_iteration_run(MigrationState *s)
   3544{
   3545    uint64_t pending_size, pend_pre, pend_compat, pend_post;
   3546    bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
   3547
   3548    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
   3549                              &pend_compat, &pend_post);
   3550    pending_size = pend_pre + pend_compat + pend_post;
   3551
   3552    trace_migrate_pending(pending_size, s->threshold_size,
   3553                          pend_pre, pend_compat, pend_post);
   3554
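           /*
            * Decision summary (editor's note, restating the branches below):
            *   pending >= threshold, postcopy requested, pend_pre fits
            *       -> switch to postcopy and skip this iteration;
            *   pending >= threshold otherwise -> one more precopy iteration;
            *   pending < threshold -> run completion and break the loop.
            */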
   3555    if (pending_size && pending_size >= s->threshold_size) {
   3556        /* Still a significant amount to transfer */
   3557        if (!in_postcopy && pend_pre <= s->threshold_size &&
   3558            qatomic_read(&s->start_postcopy)) {
   3559            if (postcopy_start(s)) {
   3560                error_report("%s: postcopy failed to start", __func__);
   3561            }
   3562            return MIG_ITERATE_SKIP;
   3563        }
   3564        /* Just another iteration step */
   3565        qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
   3566    } else {
   3567        trace_migration_thread_low_pending(pending_size);
   3568        migration_completion(s);
   3569        return MIG_ITERATE_BREAK;
   3570    }
   3571
   3572    return MIG_ITERATE_RESUME;
   3573}
   3574
   3575static void migration_iteration_finish(MigrationState *s)
   3576{
   3577    /* If we enabled cpu throttling for auto-converge, turn it off. */
   3578    cpu_throttle_stop();
   3579
   3580    qemu_mutex_lock_iothread();
   3581    switch (s->state) {
   3582    case MIGRATION_STATUS_COMPLETED:
   3583        migration_calculate_complete(s);
   3584        runstate_set(RUN_STATE_POSTMIGRATE);
   3585        break;
   3586
   3587    case MIGRATION_STATUS_ACTIVE:
   3588        /*
   3589         * We should really assert here, but since it's during
   3590         * migration, let's try to reduce the usage of assertions.
   3591         */
   3592        if (!migrate_colo_enabled()) {
   3593            error_report("%s: critical error: calling COLO code without "
   3594                         "COLO enabled", __func__);
   3595        }
   3596        migrate_start_colo_process(s);
   3597        /*
   3598         * Fixme: we will run the VM in COLO no matter its old running state.
   3599         * After exiting COLO, we will keep running.
   3600         */
   3601        s->vm_was_running = true;
   3602        /* Fallthrough */
   3603    case MIGRATION_STATUS_FAILED:
   3604    case MIGRATION_STATUS_CANCELLED:
   3605    case MIGRATION_STATUS_CANCELLING:
   3606        if (s->vm_was_running) {
   3607            vm_start();
   3608        } else {
   3609            if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
   3610                runstate_set(RUN_STATE_POSTMIGRATE);
   3611            }
   3612        }
   3613        break;
   3614
   3615    default:
   3616        /* Should not reach here, but if so, forgive the VM. */
   3617        error_report("%s: Unknown ending state %d", __func__, s->state);
   3618        break;
   3619    }
   3620    migrate_fd_cleanup_schedule(s);
   3621    qemu_mutex_unlock_iothread();
   3622}
   3623
   3624static void bg_migration_iteration_finish(MigrationState *s)
   3625{
   3626    qemu_mutex_lock_iothread();
   3627    switch (s->state) {
   3628    case MIGRATION_STATUS_COMPLETED:
   3629        migration_calculate_complete(s);
   3630        break;
   3631
   3632    case MIGRATION_STATUS_ACTIVE:
   3633    case MIGRATION_STATUS_FAILED:
   3634    case MIGRATION_STATUS_CANCELLED:
   3635    case MIGRATION_STATUS_CANCELLING:
   3636        break;
   3637
   3638    default:
   3639        /* Should not reach here, but if so, forgive the VM. */
   3640        error_report("%s: Unknown ending state %d", __func__, s->state);
   3641        break;
   3642    }
   3643
   3644    migrate_fd_cleanup_schedule(s);
   3645    qemu_mutex_unlock_iothread();
   3646}
   3647
   3648/*
   3649 * Returns the iteration state: MIG_ITERATE_RESUME to continue directly
   3650 * to the next iteration, MIG_ITERATE_BREAK once completion has run.
   3651 */
   3652static MigIterateState bg_migration_iteration_run(MigrationState *s)
   3653{
   3654    int res;
   3655
   3656    res = qemu_savevm_state_iterate(s->to_dst_file, false);
   3657    if (res > 0) {
   3658        bg_migration_completion(s);
   3659        return MIG_ITERATE_BREAK;
   3660    }
   3661
   3662    return MIG_ITERATE_RESUME;
   3663}
   3664
   3665void migration_make_urgent_request(void)
   3666{
   3667    qemu_sem_post(&migrate_get_current()->rate_limit_sem);
   3668}
   3669
   3670void migration_consume_urgent_request(void)
   3671{
   3672    qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
   3673}
   3674
   3675/* Returns true if the rate limiting was broken by an urgent request */
   3676bool migration_rate_limit(void)
   3677{
   3678    int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
   3679    MigrationState *s = migrate_get_current();
   3680
   3681    bool urgent = false;
   3682    migration_update_counters(s, now);
   3683    if (qemu_file_rate_limit(s->to_dst_file)) {
   3684
   3685        if (qemu_file_get_error(s->to_dst_file)) {
   3686            return false;
   3687        }
   3688        /*
   3689         * Wait for a delay to do rate limiting OR
   3690         * something urgent to post the semaphore.
   3691         */
   3692        int ms = s->iteration_start_time + BUFFER_DELAY - now;
   3693        trace_migration_rate_limit_pre(ms);
   3694        if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
   3695            /*
   3696             * We were woken by one or more urgent things but
   3697             * the timedwait will have consumed one of them.
   3698             * The service routine for the urgent wake will dec
   3699             * the semaphore itself for each item it consumes,
   3700             * so add back the one we just ate.
   3701             */
   3702            qemu_sem_post(&s->rate_limit_sem);
   3703            urgent = true;
   3704        }
   3705        trace_migration_rate_limit_post(urgent);
   3706    }
   3707    return urgent;
   3708}
   3709
   3710/*
   3711 * If failover devices are present, wait until they are completely
   3712 * unplugged.
   3713 */
   3714
   3715static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
   3716                                    int new_state)
   3717{
   3718    if (qemu_savevm_state_guest_unplug_pending()) {
   3719        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
   3720
   3721        while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
   3722               qemu_savevm_state_guest_unplug_pending()) {
   3723            qemu_sem_timedwait(&s->wait_unplug_sem, 250);
   3724        }
   3725        if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
   3726            int timeout = 120; /* 30 seconds */
   3727            /*
   3728             * The migration has been canceled, but as we have started an
   3729             * unplug we must wait for it to finish in order to be able to
   3730             * plug the card back in.
   3731             */
   3732            while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
   3733                qemu_sem_timedwait(&s->wait_unplug_sem, 250);
   3734            }
   3735            if (qemu_savevm_state_guest_unplug_pending()) {
   3736                warn_report("migration: partially unplugged device on "
   3737                            "failure");
   3738            }
   3739        }
   3740
   3741        migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
   3742    } else {
   3743        migrate_set_state(&s->state, old_state, new_state);
   3744    }
   3745}
   3746
   3747/*
   3748 * Master migration thread on the source VM.
   3749 * It drives the migration and pumps the data down the outgoing channel.
   3750 */
   3751static void *migration_thread(void *opaque)
   3752{
   3753    MigrationState *s = opaque;
   3754    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
   3755    MigThrError thr_error;
   3756    bool urgent = false;
   3757
   3758    rcu_register_thread();
   3759
   3760    object_ref(OBJECT(s));
   3761    update_iteration_initial_status(s);
   3762
   3763    qemu_savevm_state_header(s->to_dst_file);
   3764
   3765    /*
   3766     * If we opened the return path, we need to make sure dst has it
   3767     * opened as well.
   3768     */
   3769    if (s->rp_state.rp_thread_created) {
   3770        /* Now tell the dest that it should open its end so it can reply */
   3771        qemu_savevm_send_open_return_path(s->to_dst_file);
   3772
   3773        /* And do a ping that will make stuff easier to debug */
   3774        qemu_savevm_send_ping(s->to_dst_file, 1);
   3775    }
   3776
   3777    if (migrate_postcopy()) {
   3778        /*
   3779         * Tell the destination that we *might* want to do postcopy later;
   3780         * if the other end can't do postcopy it should fail now, nice and
   3781         * early.
   3782         */
   3783        qemu_savevm_send_postcopy_advise(s->to_dst_file);
   3784    }
   3785
   3786    if (migrate_colo_enabled()) {
   3787        /* Notify migration destination that we enable COLO */
   3788        qemu_savevm_send_colo_enable(s->to_dst_file);
   3789    }
   3790
   3791    qemu_savevm_state_setup(s->to_dst_file);
   3792
   3793    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
   3794                               MIGRATION_STATUS_ACTIVE);
   3795
   3796    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
   3797
   3798    trace_migration_thread_setup_complete();
   3799
   3800    while (migration_is_active(s)) {
   3801        if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
   3802            MigIterateState iter_state = migration_iteration_run(s);
   3803            if (iter_state == MIG_ITERATE_SKIP) {
   3804                continue;
   3805            } else if (iter_state == MIG_ITERATE_BREAK) {
   3806                break;
   3807            }
   3808        }
   3809
   3810        /*
   3811         * Try to detect any kind of failures, and see whether we
   3812         * should stop the migration now.
   3813         */
   3814        thr_error = migration_detect_error(s);
   3815        if (thr_error == MIG_THR_ERR_FATAL) {
   3816            /* Stop migration */
   3817            break;
   3818        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
   3819            /*
   3820             * Just recovered from, e.g., a network failure; reset all
   3821             * the local variables. This is important to avoid
   3822             * breaking the transferred_bytes and bandwidth calculations.
   3823             */
   3824            update_iteration_initial_status(s);
   3825        }
   3826
   3827        urgent = migration_rate_limit();
   3828    }
   3829
   3830    trace_migration_thread_after_loop();
   3831    migration_iteration_finish(s);
   3832    object_unref(OBJECT(s));
   3833    rcu_unregister_thread();
   3834    return NULL;
   3835}
   3836
   3837static void bg_migration_vm_start_bh(void *opaque)
   3838{
   3839    MigrationState *s = opaque;
   3840
   3841    qemu_bh_delete(s->vm_start_bh);
   3842    s->vm_start_bh = NULL;
   3843
   3844    vm_start();
   3845    s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
   3846}
   3847
   3848/**
   3849 * Background snapshot thread, based on live migration code.
   3850 * This is an alternative implementation of the live migration mechanism,
   3851 * introduced specifically to support background snapshots.
   3852 *
   3853 * It takes advantage of the userfault_fd write protection mechanism
   3854 * introduced in the v5.7 kernel. Compared to the existing dirty page
   3855 * logging migration, much less stream traffic is produced, resulting in
   3856 * smaller snapshot images, simply because no page duplicates can get into
   3857 * the stream.
   3858 *
   3859 * Another key point is that the generated vmstate stream reflects the
   3860 * machine state 'frozen' at the beginning of snapshot creation, whereas
   3861 * with dirty page logging the snapshot is the VM state at the end.
   3862 */
   3863static void *bg_migration_thread(void *opaque)
   3864{
   3865    MigrationState *s = opaque;
   3866    int64_t setup_start;
   3867    MigThrError thr_error;
   3868    QEMUFile *fb;
   3869    bool early_fail = true;
   3870
   3871    rcu_register_thread();
   3872    object_ref(OBJECT(s));
   3873
   3874    qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
   3875
   3876    setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
   3877    /*
   3878     * We want to save vmstate for the moment when migration has been
   3879     * initiated, but also to save RAM content while the VM is running.
   3880     * The RAM content should appear first in the vmstate. So, we first
   3881     * stash the non-RAM part of the vmstate to the temporary buffer,
   3882     * then write RAM part of the vmstate to the migration stream
   3883     * with vCPUs running and, finally, write stashed non-RAM part of
   3884     * the vmstate from the buffer to the migration stream.
   3885     */
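           /*
            * Editor's sketch of the resulting snapshot stream, under the
            * ordering described above:
            *
            *   | header | RAM pages (saved while vCPUs run) | device state |
            *
            * where the device state is appended from s->bioc by
            * bg_migration_completion().
            */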
   3886    s->bioc = qio_channel_buffer_new(512 * 1024);
   3887    qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
   3888    fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc));
   3889    object_unref(OBJECT(s->bioc));
   3890
   3891    update_iteration_initial_status(s);
   3892
   3893    /*
   3894     * Prepare for tracking memory writes with UFFD-WP - populate
   3895     * RAM pages before protecting.
   3896     */
   3897#ifdef __linux__
   3898    ram_write_tracking_prepare();
   3899#endif
   3900
   3901    qemu_savevm_state_header(s->to_dst_file);
   3902    qemu_savevm_state_setup(s->to_dst_file);
   3903
   3904    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
   3905                               MIGRATION_STATUS_ACTIVE);
   3906
   3907    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
   3908
   3909    trace_migration_thread_setup_complete();
   3910    s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
   3911
   3912    qemu_mutex_lock_iothread();
   3913
   3914    /*
   3915     * If the VM is currently in a suspended state, we need to wake it up
   3916     * to make a valid runstate transition in vm_stop_force_state().
   3917     */
   3918    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
   3919    s->vm_was_running = runstate_is_running();
   3920
   3921    if (global_state_store()) {
   3922        goto fail;
   3923    }
   3924    /* Forcibly stop VM before saving state of vCPUs and devices */
   3925    if (vm_stop_force_state(RUN_STATE_PAUSED)) {
   3926        goto fail;
   3927    }
   3928    /*
   3929     * Put vCPUs in sync with shadow context structures, then
   3930     * save their state to channel-buffer along with devices.
   3931     */
   3932    cpu_synchronize_all_states();
   3933    if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
   3934        goto fail;
   3935    }
   3936    /*
   3937     * Since we are going to get non-iterable state data directly
   3938     * from s->bioc->data, an explicit flush is needed here.
   3939     */
   3940    qemu_fflush(fb);
   3941
   3942    /* Now initialize UFFD context and start tracking RAM writes */
   3943    if (ram_write_tracking_start()) {
   3944        goto fail;
   3945    }
   3946    early_fail = false;
   3947
   3948    /*
   3949     * Start the VM from a BH handler to avoid taking the write-fault lock
   3950     * here: UFFD-WP protection for the whole of RAM is already enabled, so
   3951     * calling the VM state change notifiers from vm_start() would initiate
   3952     * writes to virtio VQ memory, which is in a write-protected region.
   3953     */
   3954    s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
   3955    qemu_bh_schedule(s->vm_start_bh);
   3956
   3957    qemu_mutex_unlock_iothread();
   3958
   3959    while (migration_is_active(s)) {
   3960        MigIterateState iter_state = bg_migration_iteration_run(s);
   3961        if (iter_state == MIG_ITERATE_SKIP) {
   3962            continue;
   3963        } else if (iter_state == MIG_ITERATE_BREAK) {
   3964            break;
   3965        }
   3966
   3967        /*
   3968         * Try to detect any kind of failures, and see whether we
   3969         * should stop the migration now.
   3970         */
   3971        thr_error = migration_detect_error(s);
   3972        if (thr_error == MIG_THR_ERR_FATAL) {
   3973            /* Stop migration */
   3974            break;
   3975        }
   3976
   3977        migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
   3978    }
   3979
   3980    trace_migration_thread_after_loop();
   3981
   3982fail:
   3983    if (early_fail) {
   3984        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
   3985                MIGRATION_STATUS_FAILED);
   3986        qemu_mutex_unlock_iothread();
   3987    }
   3988
   3989    bg_migration_iteration_finish(s);
   3990
   3991    qemu_fclose(fb);
   3992    object_unref(OBJECT(s));
   3993    rcu_unregister_thread();
   3994
   3995    return NULL;
   3996}
   3997
   3998void migrate_fd_connect(MigrationState *s, Error *error_in)
   3999{
   4000    Error *local_err = NULL;
   4001    int64_t rate_limit;
   4002    bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
   4003
   4004    /*
   4005     * If there's a previous error, free it and prepare for another one.
   4006     * Meanwhile, if migration completes successfully, there won't be an
   4007     * error dumped when calling migrate_fd_cleanup().
   4008     */
   4009    migrate_error_free(s);
   4010
   4011    s->expected_downtime = s->parameters.downtime_limit;
   4012    if (resume) {
   4013        assert(s->cleanup_bh);
   4014    } else {
   4015        assert(!s->cleanup_bh);
   4016        s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup_bh, s);
   4017    }
   4018    if (error_in) {
   4019        migrate_fd_error(s, error_in);
   4020        if (resume) {
   4021            /*
   4022             * Don't do cleanup for resume if the channel is invalid; only dump
   4023             * the error.  We wait for another channel connect from the user.
   4024             * The error_report still gives the HMP user a hint on what failed.
   4025             * It's normally done in migrate_fd_cleanup(), but call it here
   4026             * explicitly.
   4027             */
   4028            error_report_err(error_copy(s->error));
   4029        } else {
   4030            migrate_fd_cleanup(s);
   4031        }
   4032        return;
   4033    }
   4034
   4035    if (resume) {
   4036        /* This is a resumed migration */
   4037        rate_limit = s->parameters.max_postcopy_bandwidth /
   4038            XFER_LIMIT_RATIO;
   4039    } else {
   4040        /* This is a fresh new migration */
   4041        rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO;
   4042
   4043        /* Notify before starting migration thread */
   4044        notifier_list_notify(&migration_state_notifiers, s);
   4045    }
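           /*
            * Editor's note: the rate limit is applied per BUFFER_DELAY
            * (100 ms) chunk, hence the division by XFER_LIMIT_RATIO (10).
            * E.g. a max-bandwidth of 1 GiB/s allows ~102 MiB per 100 ms
            * chunk.
            */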
   4046
   4047    qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
   4048    qemu_file_set_blocking(s->to_dst_file, true);
   4049
   4050    /*
   4051     * Open the return path. For postcopy, it is always used. For
   4052     * precopy, QEMU uses the return path only if the user specified the
   4053     * "return-path" capability.
   4054     */
   4055    if (migrate_postcopy_ram() || migrate_use_return_path()) {
   4056        if (open_return_path_on_source(s, !resume)) {
   4057            error_report("Unable to open return-path for postcopy");
   4058            migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
   4059            migrate_fd_cleanup(s);
   4060            return;
   4061        }
   4062    }
   4063
   4064    if (resume) {
   4065        /* Wakeup the main migration thread to do the recovery */
   4066        migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
   4067                          MIGRATION_STATUS_POSTCOPY_RECOVER);
   4068        qemu_sem_post(&s->postcopy_pause_sem);
   4069        return;
   4070    }
   4071
   4072    if (multifd_save_setup(&local_err) != 0) {
   4073        error_report_err(local_err);
   4074        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
   4075                          MIGRATION_STATUS_FAILED);
   4076        migrate_fd_cleanup(s);
   4077        return;
   4078    }
   4079
   4080    if (migrate_background_snapshot()) {
   4081        qemu_thread_create(&s->thread, "bg_snapshot",
   4082                bg_migration_thread, s, QEMU_THREAD_JOINABLE);
   4083    } else {
   4084        qemu_thread_create(&s->thread, "live_migration",
   4085                migration_thread, s, QEMU_THREAD_JOINABLE);
   4086    }
   4087    s->migration_thread_running = true;
   4088}
   4089
   4090void migration_global_dump(Monitor *mon)
   4091{
   4092    MigrationState *ms = migrate_get_current();
   4093
   4094    monitor_printf(mon, "globals:\n");
   4095    monitor_printf(mon, "store-global-state: %s\n",
   4096                   ms->store_global_state ? "on" : "off");
   4097    monitor_printf(mon, "only-migratable: %s\n",
   4098                   only_migratable ? "on" : "off");
   4099    monitor_printf(mon, "send-configuration: %s\n",
   4100                   ms->send_configuration ? "on" : "off");
   4101    monitor_printf(mon, "send-section-footer: %s\n",
   4102                   ms->send_section_footer ? "on" : "off");
   4103    monitor_printf(mon, "decompress-error-check: %s\n",
   4104                   ms->decompress_error_check ? "on" : "off");
   4105    monitor_printf(mon, "clear-bitmap-shift: %u\n",
   4106                   ms->clear_bitmap_shift);
   4107}
   4108
   4109#define DEFINE_PROP_MIG_CAP(name, x)             \
   4110    DEFINE_PROP_BOOL(name, MigrationState, enabled_capabilities[x], false)
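       /*
        * Example expansion (editor's note):
        *   DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO)
        * becomes
        *   DEFINE_PROP_BOOL("x-colo", MigrationState,
        *                    enabled_capabilities[MIGRATION_CAPABILITY_X_COLO],
        *                    false)
        */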
   4111
   4112static Property migration_properties[] = {
   4113    DEFINE_PROP_BOOL("store-global-state", MigrationState,
   4114                     store_global_state, true),
   4115    DEFINE_PROP_BOOL("send-configuration", MigrationState,
   4116                     send_configuration, true),
   4117    DEFINE_PROP_BOOL("send-section-footer", MigrationState,
   4118                     send_section_footer, true),
   4119    DEFINE_PROP_BOOL("decompress-error-check", MigrationState,
   4120                      decompress_error_check, true),
   4121    DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState,
   4122                      clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT),
   4123
   4124    /* Migration parameters */
   4125    DEFINE_PROP_UINT8("x-compress-level", MigrationState,
   4126                      parameters.compress_level,
   4127                      DEFAULT_MIGRATE_COMPRESS_LEVEL),
   4128    DEFINE_PROP_UINT8("x-compress-threads", MigrationState,
   4129                      parameters.compress_threads,
   4130                      DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT),
   4131    DEFINE_PROP_BOOL("x-compress-wait-thread", MigrationState,
   4132                      parameters.compress_wait_thread, true),
   4133    DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
   4134                      parameters.decompress_threads,
   4135                      DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
   4136    DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
   4137                      parameters.throttle_trigger_threshold,
   4138                      DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD),
   4139    DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
   4140                      parameters.cpu_throttle_initial,
   4141                      DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
   4142    DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
   4143                      parameters.cpu_throttle_increment,
   4144                      DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
   4145    DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
   4146                      parameters.cpu_throttle_tailslow, false),
   4147    DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
   4148                      parameters.max_bandwidth, MAX_THROTTLE),
   4149    DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
   4150                      parameters.downtime_limit,
   4151                      DEFAULT_MIGRATE_SET_DOWNTIME),
   4152    DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState,
   4153                      parameters.x_checkpoint_delay,
   4154                      DEFAULT_MIGRATE_X_CHECKPOINT_DELAY),
   4155    DEFINE_PROP_UINT8("multifd-channels", MigrationState,
   4156                      parameters.multifd_channels,
   4157                      DEFAULT_MIGRATE_MULTIFD_CHANNELS),
   4158    DEFINE_PROP_MULTIFD_COMPRESSION("multifd-compression", MigrationState,
   4159                      parameters.multifd_compression,
   4160                      DEFAULT_MIGRATE_MULTIFD_COMPRESSION),
   4161    DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState,
   4162                      parameters.multifd_zlib_level,
   4163                      DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL),
   4164    DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState,
   4165                      parameters.multifd_zstd_level,
   4166                      DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL),
   4167    DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
   4168                      parameters.xbzrle_cache_size,
   4169                      DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE),
   4170    DEFINE_PROP_SIZE("max-postcopy-bandwidth", MigrationState,
   4171                      parameters.max_postcopy_bandwidth,
   4172                      DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH),
   4173    DEFINE_PROP_UINT8("max-cpu-throttle", MigrationState,
   4174                      parameters.max_cpu_throttle,
   4175                      DEFAULT_MIGRATE_MAX_CPU_THROTTLE),
   4176    DEFINE_PROP_SIZE("announce-initial", MigrationState,
   4177                      parameters.announce_initial,
   4178                      DEFAULT_MIGRATE_ANNOUNCE_INITIAL),
   4179    DEFINE_PROP_SIZE("announce-max", MigrationState,
   4180                      parameters.announce_max,
   4181                      DEFAULT_MIGRATE_ANNOUNCE_MAX),
   4182    DEFINE_PROP_SIZE("announce-rounds", MigrationState,
   4183                      parameters.announce_rounds,
   4184                      DEFAULT_MIGRATE_ANNOUNCE_ROUNDS),
   4185    DEFINE_PROP_SIZE("announce-step", MigrationState,
   4186                      parameters.announce_step,
   4187                      DEFAULT_MIGRATE_ANNOUNCE_STEP),
   4188
   4189    /* Migration capabilities */
   4190    DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
   4191    DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL),
   4192    DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE),
   4193    DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS),
   4194    DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS),
   4195    DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS),
   4196    DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM),
   4197    DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO),
   4198    DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM),
   4199    DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
   4200    DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
   4201    DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD),
   4202    DEFINE_PROP_MIG_CAP("x-background-snapshot",
   4203            MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT),
   4204
   4205    DEFINE_PROP_END_OF_LIST(),
   4206};
   4207
   4208static void migration_class_init(ObjectClass *klass, void *data)
   4209{
   4210    DeviceClass *dc = DEVICE_CLASS(klass);
   4211
   4212    dc->user_creatable = false;
   4213    device_class_set_props(dc, migration_properties);
   4214}
   4215
   4216static void migration_instance_finalize(Object *obj)
   4217{
   4218    MigrationState *ms = MIGRATION_OBJ(obj);
   4219    MigrationParameters *params = &ms->parameters;
   4220
   4221    qemu_mutex_destroy(&ms->error_mutex);
   4222    qemu_mutex_destroy(&ms->qemu_file_lock);
   4223    g_free(params->tls_hostname);
   4224    g_free(params->tls_creds);
   4225    qemu_sem_destroy(&ms->wait_unplug_sem);
   4226    qemu_sem_destroy(&ms->rate_limit_sem);
   4227    qemu_sem_destroy(&ms->pause_sem);
   4228    qemu_sem_destroy(&ms->postcopy_pause_sem);
   4229    qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
   4230    qemu_sem_destroy(&ms->rp_state.rp_sem);
   4231    error_free(ms->error);
   4232}
   4233
   4234static void migration_instance_init(Object *obj)
   4235{
   4236    MigrationState *ms = MIGRATION_OBJ(obj);
   4237    MigrationParameters *params = &ms->parameters;
   4238
   4239    ms->state = MIGRATION_STATUS_NONE;
   4240    ms->mbps = -1;
   4241    ms->pages_per_second = -1;
   4242    qemu_sem_init(&ms->pause_sem, 0);
   4243    qemu_mutex_init(&ms->error_mutex);
   4244
   4245    params->tls_hostname = g_strdup("");
   4246    params->tls_creds = g_strdup("");
   4247
   4248    /* Set has_* up only for parameter checks */
   4249    params->has_compress_level = true;
   4250    params->has_compress_threads = true;
   4251    params->has_decompress_threads = true;
   4252    params->has_throttle_trigger_threshold = true;
   4253    params->has_cpu_throttle_initial = true;
   4254    params->has_cpu_throttle_increment = true;
   4255    params->has_cpu_throttle_tailslow = true;
   4256    params->has_max_bandwidth = true;
   4257    params->has_downtime_limit = true;
   4258    params->has_x_checkpoint_delay = true;
   4259    params->has_block_incremental = true;
   4260    params->has_multifd_channels = true;
   4261    params->has_multifd_compression = true;
   4262    params->has_multifd_zlib_level = true;
   4263    params->has_multifd_zstd_level = true;
   4264    params->has_xbzrle_cache_size = true;
   4265    params->has_max_postcopy_bandwidth = true;
   4266    params->has_max_cpu_throttle = true;
   4267    params->has_announce_initial = true;
   4268    params->has_announce_max = true;
   4269    params->has_announce_rounds = true;
   4270    params->has_announce_step = true;
   4271
   4272    qemu_sem_init(&ms->postcopy_pause_sem, 0);
   4273    qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
   4274    qemu_sem_init(&ms->rp_state.rp_sem, 0);
   4275    qemu_sem_init(&ms->rate_limit_sem, 0);
   4276    qemu_sem_init(&ms->wait_unplug_sem, 0);
   4277    qemu_mutex_init(&ms->qemu_file_lock);
   4278}
   4279
   4280/*
   4281 * Return true if the checks pass, false otherwise. An error will be put
   4282 * in *errp if provided.
   4283 */
   4284static bool migration_object_check(MigrationState *ms, Error **errp)
   4285{
   4286    MigrationCapabilityStatusList *head = NULL;
   4287    /* Assuming all off */
   4288    bool cap_list[MIGRATION_CAPABILITY__MAX] = { 0 }, ret;
   4289    int i;
   4290
   4291    if (!migrate_params_check(&ms->parameters, errp)) {
   4292        return false;
   4293    }
   4294
   4295    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
   4296        if (ms->enabled_capabilities[i]) {
   4297            QAPI_LIST_PREPEND(head, migrate_cap_add(i, true));
   4298        }
   4299    }
   4300
   4301    ret = migrate_caps_check(cap_list, head, errp);
   4302
   4303    /* It works with head == NULL */
   4304    qapi_free_MigrationCapabilityStatusList(head);
   4305
   4306    return ret;
   4307}
   4308
   4309static const TypeInfo migration_type = {
   4310    .name = TYPE_MIGRATION,
   4311    /*
   4312     * NOTE: TYPE_MIGRATION is not really a device, as the object is
   4313     * not created using qdev_new(), it is not attached to the qdev
   4314     * device tree, and it is never realized.
   4315     *
   4316     * TODO: Make this TYPE_OBJECT once QOM provides something like
   4317     * TYPE_DEVICE's "-global" properties.
   4318     */
   4319    .parent = TYPE_DEVICE,
   4320    .class_init = migration_class_init,
   4321    .class_size = sizeof(MigrationClass),
   4322    .instance_size = sizeof(MigrationState),
   4323    .instance_init = migration_instance_init,
   4324    .instance_finalize = migration_instance_finalize,
   4325};
   4326
   4327static void register_migration_types(void)
   4328{
   4329    type_register_static(&migration_type);
   4330}
   4331
   4332type_init(register_migration_types);