cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

ram.c (128779B)


      1/*
      2 * QEMU System Emulator
      3 *
      4 * Copyright (c) 2003-2008 Fabrice Bellard
      5 * Copyright (c) 2011-2015 Red Hat Inc
      6 *
      7 * Authors:
      8 *  Juan Quintela <quintela@redhat.com>
      9 *
     10 * Permission is hereby granted, free of charge, to any person obtaining a copy
     11 * of this software and associated documentation files (the "Software"), to deal
     12 * in the Software without restriction, including without limitation the rights
     13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     14 * copies of the Software, and to permit persons to whom the Software is
     15 * furnished to do so, subject to the following conditions:
     16 *
     17 * The above copyright notice and this permission notice shall be included in
     18 * all copies or substantial portions of the Software.
     19 *
     20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     26 * THE SOFTWARE.
     27 */
     28
     29#include "qemu/osdep.h"
     30#include "qemu/cutils.h"
     31#include "qemu/bitops.h"
     32#include "qemu/bitmap.h"
     33#include "qemu/main-loop.h"
     34#include "xbzrle.h"
     35#include "ram.h"
     36#include "migration.h"
     37#include "migration/register.h"
     38#include "migration/misc.h"
     39#include "qemu-file.h"
     40#include "postcopy-ram.h"
     41#include "page_cache.h"
     42#include "qemu/error-report.h"
     43#include "qapi/error.h"
     44#include "qapi/qapi-types-migration.h"
     45#include "qapi/qapi-events-migration.h"
     46#include "qapi/qmp/qerror.h"
     47#include "trace.h"
     48#include "exec/ram_addr.h"
     49#include "exec/target_page.h"
     50#include "qemu/rcu_queue.h"
     51#include "migration/colo.h"
     52#include "block.h"
     53#include "sysemu/cpu-throttle.h"
     54#include "savevm.h"
     55#include "qemu/iov.h"
     56#include "multifd.h"
     57#include "sysemu/runstate.h"
     58
     59#if defined(__linux__)
     60#include "qemu/userfaultfd.h"
     61#endif /* defined(__linux__) */
     62
     63/***********************************************************/
     64/* ram save/restore */
     65
     66/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
     67 * worked for pages that were filled with the same char.  We switched
     68 * it to only search for the zero value.  It was renamed to avoid
     69 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
     70 */
     71
     72#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
     73#define RAM_SAVE_FLAG_ZERO     0x02
     74#define RAM_SAVE_FLAG_MEM_SIZE 0x04
     75#define RAM_SAVE_FLAG_PAGE     0x08
     76#define RAM_SAVE_FLAG_EOS      0x10
     77#define RAM_SAVE_FLAG_CONTINUE 0x20
     78#define RAM_SAVE_FLAG_XBZRLE   0x40
     79/* 0x80 is reserved in migration.h; start the next flag at 0x100 */
     80#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
     81
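/*
 * Editorial sketch, not part of the original ram.c: the RAM_SAVE_FLAG_* values
 * above are OR'ed into the low bits of a page-aligned RAM offset before the
 * combined value goes on the wire (see save_page_header() below), so a reader
 * can split the two with TARGET_PAGE_MASK.  The helper name is illustrative
 * only.
 */
static inline void example_split_wire_addr(uint64_t wire_addr,
                                           ram_addr_t *offset, int *flags)
{
    /* bits below the target page size carry the RAM_SAVE_FLAG_* flags */
    *flags = wire_addr & ~TARGET_PAGE_MASK;
    /* the remaining bits are the page-aligned offset within the RAMBlock */
    *offset = wire_addr & TARGET_PAGE_MASK;
}
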
     82static inline bool is_zero_range(uint8_t *p, uint64_t size)
     83{
     84    return buffer_is_zero(p, size);
     85}
     86
     87XBZRLECacheStats xbzrle_counters;
     88
     89/* This struct contains the XBZRLE cache and a static page
     90   used by the compression */
     91static struct {
     92    /* buffer used for XBZRLE encoding */
     93    uint8_t *encoded_buf;
     94    /* buffer for storing page content */
     95    uint8_t *current_buf;
     96    /* Cache for XBZRLE, Protected by lock. */
     97    PageCache *cache;
     98    QemuMutex lock;
     99    /* it will store a page full of zeros */
    100    uint8_t *zero_target_page;
    101    /* buffer used for XBZRLE decoding */
    102    uint8_t *decoded_buf;
    103} XBZRLE;
    104
    105static void XBZRLE_cache_lock(void)
    106{
    107    if (migrate_use_xbzrle()) {
    108        qemu_mutex_lock(&XBZRLE.lock);
    109    }
    110}
    111
    112static void XBZRLE_cache_unlock(void)
    113{
    114    if (migrate_use_xbzrle()) {
    115        qemu_mutex_unlock(&XBZRLE.lock);
    116    }
    117}
    118
    119/**
    120 * xbzrle_cache_resize: resize the xbzrle cache
    121 *
    122 * This function is called from migrate_params_apply in the main
    123 * thread, possibly while a migration is in progress.  A running
    124 * migration may be using the cache and might finish during this call,
    125 * hence changes to the cache are protected by XBZRLE.lock.
    126 *
    127 * Returns 0 for success or -1 for error
    128 *
    129 * @new_size: new cache size
    130 * @errp: set *errp if the check failed, with reason
    131 */
    132int xbzrle_cache_resize(uint64_t new_size, Error **errp)
    133{
    134    PageCache *new_cache;
    135    int64_t ret = 0;
    136
    137    /* Check for truncation */
    138    if (new_size != (size_t)new_size) {
    139        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
    140                   "exceeding address space");
    141        return -1;
    142    }
    143
    144    if (new_size == migrate_xbzrle_cache_size()) {
    145        /* nothing to do */
    146        return 0;
    147    }
    148
    149    XBZRLE_cache_lock();
    150
    151    if (XBZRLE.cache != NULL) {
    152        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
    153        if (!new_cache) {
    154            ret = -1;
    155            goto out;
    156        }
    157
    158        cache_fini(XBZRLE.cache);
    159        XBZRLE.cache = new_cache;
    160    }
    161out:
    162    XBZRLE_cache_unlock();
    163    return ret;
    164}
    165
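/*
 * Editorial usage sketch, not part of the original ram.c: a caller of
 * xbzrle_cache_resize() is expected to hand in an Error pointer and report it
 * on failure, roughly like this (variable names are illustrative):
 *
 *     Error *local_err = NULL;
 *
 *     if (xbzrle_cache_resize(new_size, &local_err) < 0) {
 *         error_report_err(local_err);
 *     }
 *
 * error_report_err() both prints and frees the Error object.
 */
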
    166bool ramblock_is_ignored(RAMBlock *block)
    167{
    168    return !qemu_ram_is_migratable(block) ||
    169           (migrate_ignore_shared() && qemu_ram_is_shared(block));
    170}
    171
    172#undef RAMBLOCK_FOREACH
    173
    174int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
    175{
    176    RAMBlock *block;
    177    int ret = 0;
    178
    179    RCU_READ_LOCK_GUARD();
    180
    181    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
    182        ret = func(block, opaque);
    183        if (ret) {
    184            break;
    185        }
    186    }
    187    return ret;
    188}
    189
    190static void ramblock_recv_map_init(void)
    191{
    192    RAMBlock *rb;
    193
    194    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
    195        assert(!rb->receivedmap);
    196        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    197    }
    198}
    199
    200int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
    201{
    202    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
    203                    rb->receivedmap);
    204}
    205
    206bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
    207{
    208    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
    209}
    210
    211void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
    212{
    213    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
    214}
    215
    216void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
    217                                    size_t nr)
    218{
    219    bitmap_set_atomic(rb->receivedmap,
    220                      ramblock_recv_bitmap_offset(host_addr, rb),
    221                      nr);
    222}
    223
    224#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
    225
    226/*
    227 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
    228 *
    229 * Returns the number of bytes sent (>0) on success, or <0 on error.
    230 */
    231int64_t ramblock_recv_bitmap_send(QEMUFile *file,
    232                                  const char *block_name)
    233{
    234    RAMBlock *block = qemu_ram_block_by_name(block_name);
    235    unsigned long *le_bitmap, nbits;
    236    uint64_t size;
    237
    238    if (!block) {
    239        error_report("%s: invalid block name: %s", __func__, block_name);
    240        return -1;
    241    }
    242
    243    nbits = block->postcopy_length >> TARGET_PAGE_BITS;
    244
    245    /*
    246     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
    247     * machines we may need 4 more bytes for padding (see below
    248     * comment). So extend it a bit beforehand.
    249     */
    250    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
    251
    252    /*
    253     * Always use little endian when sending the bitmap. This is
    254     * required in case the source and destination VMs are not using
    255     * the same endianness. (Note: big endian won't work.)
    256     */
    257    bitmap_to_le(le_bitmap, block->receivedmap, nbits);
    258
    259    /* Size of the bitmap, in bytes */
    260    size = DIV_ROUND_UP(nbits, 8);
    261
    262    /*
    263     * size is always aligned to 8 bytes for 64bit machines, but it
    264     * may not be true for 32bit machines. We need this padding to
    265     * make sure the migration can survive even between 32bit and
    266     * 64bit machines.
    267     */
    268    size = ROUND_UP(size, 8);
    269
    270    qemu_put_be64(file, size);
    271    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    272    /*
    273     * Mark as an end, in case the middle part is screwed up due to
    274     * some "mysterious" reason.
    275     */
    276    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    277    qemu_fflush(file);
    278
    279    g_free(le_bitmap);
    280
    281    if (qemu_file_get_error(file)) {
    282        return qemu_file_get_error(file);
    283    }
    284
    285    return size + sizeof(size);
    286}
    287
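/*
 * Editorial sketch, not part of the original ram.c: a receiver of the stream
 * produced above would read the be64 size, the padded little-endian bitmap and
 * the trailing end marker, and reject the stream if the marker does not match.
 * This is a minimal outline only; the example_ name is illustrative and the
 * real consumer performs additional size validation.
 */
static inline int example_recv_bitmap_check(QEMUFile *file,
                                            uint64_t expected_size)
{
    g_autofree uint8_t *le_bitmap = NULL;
    uint64_t size, end_mark;

    size = qemu_get_be64(file);
    if (size != expected_size) {
        return -EINVAL;
    }

    le_bitmap = g_malloc0(size);
    qemu_get_buffer(file, le_bitmap, size);

    end_mark = qemu_get_be64(file);
    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        return -EINVAL;
    }

    /* here le_bitmap would be converted back into the host bitmap layout */
    return qemu_file_get_error(file);
}
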
    288/*
    289 * An outstanding page request, on the source, having been received
    290 * and queued
    291 */
    292struct RAMSrcPageRequest {
    293    RAMBlock *rb;
    294    hwaddr    offset;
    295    hwaddr    len;
    296
    297    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
    298};
    299
    300/* State of RAM for migration */
    301struct RAMState {
    302    /* QEMUFile used for this migration */
    303    QEMUFile *f;
    304    /* UFFD file descriptor, used in 'write-tracking' migration */
    305    int uffdio_fd;
    306    /* Last block that we have visited searching for dirty pages */
    307    RAMBlock *last_seen_block;
    308    /* Last block from where we have sent data */
    309    RAMBlock *last_sent_block;
    310    /* Last dirty target page we have sent */
    311    ram_addr_t last_page;
    312    /* last ram version we have seen */
    313    uint32_t last_version;
    314    /* How many times we have dirtied too many pages */
    315    int dirty_rate_high_cnt;
    316    /* these variables are used for bitmap sync */
    317    /* last time we did a full bitmap_sync */
    318    int64_t time_last_bitmap_sync;
    319    /* bytes transferred at start_time */
    320    uint64_t bytes_xfer_prev;
    321    /* number of dirty pages since start_time */
    322    uint64_t num_dirty_pages_period;
    323    /* xbzrle misses since the beginning of the period */
    324    uint64_t xbzrle_cache_miss_prev;
    325    /* Amount of xbzrle pages since the beginning of the period */
    326    uint64_t xbzrle_pages_prev;
    327    /* Amount of xbzrle encoded bytes since the beginning of the period */
    328    uint64_t xbzrle_bytes_prev;
    329    /* Start using XBZRLE (e.g., after the first round). */
    330    bool xbzrle_enabled;
    331
    332    /* compression statistics since the beginning of the period */
    333    /* number of times no free thread was available to compress data */
    334    uint64_t compress_thread_busy_prev;
    335    /* amount of bytes after compression */
    336    uint64_t compressed_size_prev;
    337    /* amount of compressed pages */
    338    uint64_t compress_pages_prev;
    339
    340    /* total handled target pages at the beginning of period */
    341    uint64_t target_page_count_prev;
    342    /* total handled target pages since start */
    343    uint64_t target_page_count;
    344    /* number of dirty bits in the bitmap */
    345    uint64_t migration_dirty_pages;
    346    /* Protects modification of the bitmap and migration dirty pages */
    347    QemuMutex bitmap_mutex;
    348    /* The RAMBlock used in the last src_page_requests */
    349    RAMBlock *last_req_rb;
    350    /* Queue of outstanding page requests from the destination */
    351    QemuMutex src_page_req_mutex;
    352    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
    353};
    354typedef struct RAMState RAMState;
    355
    356static RAMState *ram_state;
    357
    358static NotifierWithReturnList precopy_notifier_list;
    359
    360void precopy_infrastructure_init(void)
    361{
    362    notifier_with_return_list_init(&precopy_notifier_list);
    363}
    364
    365void precopy_add_notifier(NotifierWithReturn *n)
    366{
    367    notifier_with_return_list_add(&precopy_notifier_list, n);
    368}
    369
    370void precopy_remove_notifier(NotifierWithReturn *n)
    371{
    372    notifier_with_return_remove(n);
    373}
    374
    375int precopy_notify(PrecopyNotifyReason reason, Error **errp)
    376{
    377    PrecopyNotifyData pnd;
    378    pnd.reason = reason;
    379    pnd.errp = errp;
    380
    381    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
    382}
    383
    384uint64_t ram_bytes_remaining(void)
    385{
    386    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
    387                       0;
    388}
    389
    390MigrationStats ram_counters;
    391
    392/* used by the search for pages to send */
    393struct PageSearchStatus {
    394    /* Current block being searched */
    395    RAMBlock    *block;
    396    /* Current page to search from */
    397    unsigned long page;
    398    /* Set once we wrap around */
    399    bool         complete_round;
    400};
    401typedef struct PageSearchStatus PageSearchStatus;
    402
    403CompressionStats compression_counters;
    404
    405struct CompressParam {
    406    bool done;
    407    bool quit;
    408    bool zero_page;
    409    QEMUFile *file;
    410    QemuMutex mutex;
    411    QemuCond cond;
    412    RAMBlock *block;
    413    ram_addr_t offset;
    414
    415    /* internally used fields */
    416    z_stream stream;
    417    uint8_t *originbuf;
    418};
    419typedef struct CompressParam CompressParam;
    420
    421struct DecompressParam {
    422    bool done;
    423    bool quit;
    424    QemuMutex mutex;
    425    QemuCond cond;
    426    void *des;
    427    uint8_t *compbuf;
    428    int len;
    429    z_stream stream;
    430};
    431typedef struct DecompressParam DecompressParam;
    432
    433static CompressParam *comp_param;
    434static QemuThread *compress_threads;
    435/* comp_done_cond is used to wake up the migration thread when
    436 * one of the compression threads has finished the compression.
    437 * comp_done_lock is used together with comp_done_cond.
    438 */
    439static QemuMutex comp_done_lock;
    440static QemuCond comp_done_cond;
    441/* The empty QEMUFileOps will be used by file in CompressParam */
    442static const QEMUFileOps empty_ops = { };
    443
    444static QEMUFile *decomp_file;
    445static DecompressParam *decomp_param;
    446static QemuThread *decompress_threads;
    447static QemuMutex decomp_done_lock;
    448static QemuCond decomp_done_cond;
    449
    450static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
    451                                 ram_addr_t offset, uint8_t *source_buf);
    452
    453static void *do_data_compress(void *opaque)
    454{
    455    CompressParam *param = opaque;
    456    RAMBlock *block;
    457    ram_addr_t offset;
    458    bool zero_page;
    459
    460    qemu_mutex_lock(&param->mutex);
    461    while (!param->quit) {
    462        if (param->block) {
    463            block = param->block;
    464            offset = param->offset;
    465            param->block = NULL;
    466            qemu_mutex_unlock(&param->mutex);
    467
    468            zero_page = do_compress_ram_page(param->file, &param->stream,
    469                                             block, offset, param->originbuf);
    470
    471            qemu_mutex_lock(&comp_done_lock);
    472            param->done = true;
    473            param->zero_page = zero_page;
    474            qemu_cond_signal(&comp_done_cond);
    475            qemu_mutex_unlock(&comp_done_lock);
    476
    477            qemu_mutex_lock(&param->mutex);
    478        } else {
    479            qemu_cond_wait(&param->cond, &param->mutex);
    480        }
    481    }
    482    qemu_mutex_unlock(&param->mutex);
    483
    484    return NULL;
    485}
    486
    487static void compress_threads_save_cleanup(void)
    488{
    489    int i, thread_count;
    490
    491    if (!migrate_use_compression() || !comp_param) {
    492        return;
    493    }
    494
    495    thread_count = migrate_compress_threads();
    496    for (i = 0; i < thread_count; i++) {
    497        /*
    498         * we use it as an indicator of whether the thread is
    499         * properly initialized or not
    500         */
    501        if (!comp_param[i].file) {
    502            break;
    503        }
    504
    505        qemu_mutex_lock(&comp_param[i].mutex);
    506        comp_param[i].quit = true;
    507        qemu_cond_signal(&comp_param[i].cond);
    508        qemu_mutex_unlock(&comp_param[i].mutex);
    509
    510        qemu_thread_join(compress_threads + i);
    511        qemu_mutex_destroy(&comp_param[i].mutex);
    512        qemu_cond_destroy(&comp_param[i].cond);
    513        deflateEnd(&comp_param[i].stream);
    514        g_free(comp_param[i].originbuf);
    515        qemu_fclose(comp_param[i].file);
    516        comp_param[i].file = NULL;
    517    }
    518    qemu_mutex_destroy(&comp_done_lock);
    519    qemu_cond_destroy(&comp_done_cond);
    520    g_free(compress_threads);
    521    g_free(comp_param);
    522    compress_threads = NULL;
    523    comp_param = NULL;
    524}
    525
    526static int compress_threads_save_setup(void)
    527{
    528    int i, thread_count;
    529
    530    if (!migrate_use_compression()) {
    531        return 0;
    532    }
    533    thread_count = migrate_compress_threads();
    534    compress_threads = g_new0(QemuThread, thread_count);
    535    comp_param = g_new0(CompressParam, thread_count);
    536    qemu_cond_init(&comp_done_cond);
    537    qemu_mutex_init(&comp_done_lock);
    538    for (i = 0; i < thread_count; i++) {
    539        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
    540        if (!comp_param[i].originbuf) {
    541            goto exit;
    542        }
    543
    544        if (deflateInit(&comp_param[i].stream,
    545                        migrate_compress_level()) != Z_OK) {
    546            g_free(comp_param[i].originbuf);
    547            goto exit;
    548        }
    549
    550        /* comp_param[i].file is just used as a dummy buffer to save data,
    551         * set its ops to empty.
    552         */
    553        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
    554        comp_param[i].done = true;
    555        comp_param[i].quit = false;
    556        qemu_mutex_init(&comp_param[i].mutex);
    557        qemu_cond_init(&comp_param[i].cond);
    558        qemu_thread_create(compress_threads + i, "compress",
    559                           do_data_compress, comp_param + i,
    560                           QEMU_THREAD_JOINABLE);
    561    }
    562    return 0;
    563
    564exit:
    565    compress_threads_save_cleanup();
    566    return -1;
    567}
    568
    569/**
    570 * save_page_header: write page header to wire
    571 *
    572 * If this is the 1st block, it also writes the block identification
    573 *
    574 * Returns the number of bytes written
    575 *
    576 * @f: QEMUFile where to send the data
    577 * @block: block that contains the page we want to send
    578 * @offset: offset inside the block for the page
    579 *          in the lower bits, it contains flags
    580 */
    581static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
    582                               ram_addr_t offset)
    583{
    584    size_t size, len;
    585
    586    if (block == rs->last_sent_block) {
    587        offset |= RAM_SAVE_FLAG_CONTINUE;
    588    }
    589    qemu_put_be64(f, offset);
    590    size = 8;
    591
    592    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
    593        len = strlen(block->idstr);
    594        qemu_put_byte(f, len);
    595        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
    596        size += 1 + len;
    597        rs->last_sent_block = block;
    598    }
    599    return size;
    600}
    601
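/*
 * Editorial note, not part of the original ram.c: the header written above
 * looks like this on the wire:
 *
 *   [ be64: page offset with RAM_SAVE_FLAG_* OR'ed into the low bits ]
 *   [ u8: strlen(idstr) ][ idstr bytes ]   <- only when CONTINUE is not set
 *
 * which is why the returned size is either 8 or 9 + strlen(idstr).
 */
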
    602/**
    603 * mig_throttle_guest_down: throttle down the guest
    604 *
    605 * Reduce the amount of guest CPU execution to hopefully slow down memory
    606 * writes. If guest dirty memory rate is reduced below the rate at
    607 * which we can transfer pages to the destination then we should be
    608 * able to complete migration. Some workloads dirty memory way too
    609 * fast and will not effectively converge, even with auto-converge.
    610 */
    611static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
    612                                    uint64_t bytes_dirty_threshold)
    613{
    614    MigrationState *s = migrate_get_current();
    615    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    616    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    617    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    618    int pct_max = s->parameters.max_cpu_throttle;
    619
    620    uint64_t throttle_now = cpu_throttle_get_percentage();
    621    uint64_t cpu_now, cpu_ideal, throttle_inc;
    622
    623    /* We have not started throttling yet. Let's start it. */
    624    if (!cpu_throttle_active()) {
    625        cpu_throttle_set(pct_initial);
    626    } else {
    627        /* Throttling already on, just increase the rate */
    628        if (!pct_tailslow) {
    629            throttle_inc = pct_increment;
    630        } else {
    631            /* Compute the ideal CPU percentage used by the guest, which
    632             * should make the dirty rate match the dirty rate threshold. */
    633            cpu_now = 100 - throttle_now;
    634            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
    635                        bytes_dirty_period);
    636            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
    637        }
    638        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    639    }
    640}
    641
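/*
 * Editorial worked example, not part of the original ram.c: with the tailslow
 * policy above, suppose the throttle is currently at 40% (cpu_now = 60), the
 * guest dirtied 100 MB in the period and the threshold is 50 MB.  Then
 * cpu_ideal = 60 * (50 / 100) = 30, throttle_inc = MIN(60 - 30,
 * cpu_throttle_increment) and the new throttle becomes
 * MIN(40 + throttle_inc, max_cpu_throttle).
 */
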
    642/**
    643 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
    644 *
    645 * @rs: current RAM state
    646 * @current_addr: address for the zero page
    647 *
    648 * Update the xbzrle cache to reflect a page that's been sent as all 0.
    649 * The important thing is that a stale (not-yet-0'd) page be replaced
    650 * by the new data.
    651 * As a bonus, if the page wasn't in the cache it gets added so that
    652 * when a small write is made into the 0'd page it gets XBZRLE sent.
    653 */
    654static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
    655{
    656    if (!rs->xbzrle_enabled) {
    657        return;
    658    }
    659
    660    /* We don't care if this fails to allocate a new cache page
    661     * as long as it updates an old one */
    662    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
    663                 ram_counters.dirty_sync_count);
    664}
    665
    666#define ENCODING_FLAG_XBZRLE 0x1
    667
    668/**
    669 * save_xbzrle_page: compress and send current page
    670 *
    671 * Returns: 1 means that we wrote the page
    672 *          0 means that page is identical to the one already sent
    673 *          -1 means that xbzrle would be longer than normal
    674 *
    675 * @rs: current RAM state
    676 * @current_data: pointer to the address of the page contents
    677 * @current_addr: addr of the page
    678 * @block: block that contains the page we want to send
    679 * @offset: offset inside the block for the page
    680 * @last_stage: if we are at the completion stage
    681 */
    682static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
    683                            ram_addr_t current_addr, RAMBlock *block,
    684                            ram_addr_t offset, bool last_stage)
    685{
    686    int encoded_len = 0, bytes_xbzrle;
    687    uint8_t *prev_cached_page;
    688
    689    if (!cache_is_cached(XBZRLE.cache, current_addr,
    690                         ram_counters.dirty_sync_count)) {
    691        xbzrle_counters.cache_miss++;
    692        if (!last_stage) {
    693            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
    694                             ram_counters.dirty_sync_count) == -1) {
    695                return -1;
    696            } else {
    697                /* update *current_data when the page has been
    698                   inserted into cache */
    699                *current_data = get_cached_data(XBZRLE.cache, current_addr);
    700            }
    701        }
    702        return -1;
    703    }
    704
    705    /*
    706     * Reaching here means the page has hit the xbzrle cache, no matter what
    707     * encoding result it is (normal encoding, overflow or skipping the page),
    708     * count the page as encoded. This is used to calculate the encoding rate.
    709     *
    710     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
    711     * 2nd page turns out to be skipped (i.e. no new bytes written to the
    712     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
    713     * skipped page included. In this way, the encoding rate can tell if the
    714     * guest page is good for xbzrle encoding.
    715     */
    716    xbzrle_counters.pages++;
    717    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
    718
    719    /* save current buffer into memory */
    720    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
    721
    722    /* XBZRLE encoding (if there is no overflow) */
    723    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
    724                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
    725                                       TARGET_PAGE_SIZE);
    726
    727    /*
    728     * Update the cache contents, so that it corresponds to the data
    729     * sent, in all cases except where we skip the page.
    730     */
    731    if (!last_stage && encoded_len != 0) {
    732        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    733        /*
    734         * In the case where we couldn't compress, ensure that the caller
    735         * sends the data from the cache, since the guest might have
    736         * changed the RAM since we copied it.
    737         */
    738        *current_data = prev_cached_page;
    739    }
    740
    741    if (encoded_len == 0) {
    742        trace_save_xbzrle_page_skipping();
    743        return 0;
    744    } else if (encoded_len == -1) {
    745        trace_save_xbzrle_page_overflow();
    746        xbzrle_counters.overflow++;
    747        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
    748        return -1;
    749    }
    750
    751    /* Send XBZRLE based compressed page */
    752    bytes_xbzrle = save_page_header(rs, rs->f, block,
    753                                    offset | RAM_SAVE_FLAG_XBZRLE);
    754    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    755    qemu_put_be16(rs->f, encoded_len);
    756    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    757    bytes_xbzrle += encoded_len + 1 + 2;
    758    /*
    759     * Like compressed_size (please see update_compress_thread_counts),
    760     * the xbzrle encoded bytes don't count the 8 byte header with
    761     * RAM_SAVE_FLAG_CONTINUE.
    762     */
    763    xbzrle_counters.bytes += bytes_xbzrle - 8;
    764    ram_counters.transferred += bytes_xbzrle;
    765
    766    return 1;
    767}
    768
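/*
 * Editorial note, not part of the original ram.c: an XBZRLE page on the wire
 * is the page header (offset | RAM_SAVE_FLAG_XBZRLE), one ENCODING_FLAG_XBZRLE
 * byte, a be16 encoded length and then the encoded bytes, which matches the
 * "encoded_len + 1 + 2" accounting above.  xbzrle_counters.bytes deliberately
 * excludes the 8-byte header so it stays comparable to compressed_size.
 */
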
    769/**
    770 * migration_bitmap_find_dirty: find the next dirty page from start
    771 *
    772 * Returns the page offset within memory region of the start of a dirty page
    773 *
    774 * @rs: current RAM state
    775 * @rb: RAMBlock where to search for dirty pages
    776 * @start: page where we start the search
    777 */
    778static inline
    779unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
    780                                          unsigned long start)
    781{
    782    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    783    unsigned long *bitmap = rb->bmap;
    784
    785    if (ramblock_is_ignored(rb)) {
    786        return size;
    787    }
    788
    789    return find_next_bit(bitmap, size, start);
    790}
    791
    792static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
    793                                                       RAMBlock *rb,
    794                                                       unsigned long page)
    795{
    796    uint8_t shift;
    797    hwaddr size, start;
    798
    799    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
    800        return;
    801    }
    802
    803    shift = rb->clear_bmap_shift;
    804    /*
    805     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
    806     * can make things easier sometimes since the start address
    807     * of the small chunk will always be aligned to 64 pages, so the
    808     * bitmap will always be aligned to unsigned long. We should
    809     * even be able to remove this restriction but I'm simply
    810     * keeping it.
    811     */
    812    assert(shift >= 6);
    813
    814    size = 1ULL << (TARGET_PAGE_BITS + shift);
    815    start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
    816    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    817    memory_region_clear_dirty_bitmap(rb->mr, start, size);
    818}
    819
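/*
 * Editorial worked example, not part of the original ram.c: if clear_bmap_shift
 * were 18 and target pages were 4 KiB (TARGET_PAGE_BITS = 12), each clear chunk
 * would cover 1ULL << (12 + 18) = 1 GiB, and @start would be the page's address
 * rounded down to a 1 GiB boundary, so a single call clears the remote dirty
 * bitmap for the whole chunk at once.
 */
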
    820static void
    821migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
    822                                                 RAMBlock *rb,
    823                                                 unsigned long start,
    824                                                 unsigned long npages)
    825{
    826    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    827    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    828    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
    829
    830    /*
    831     * Clear pages from start to start + npages - 1, so the end boundary is
    832     * exclusive.
    833     */
    834    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
    835        migration_clear_memory_region_dirty_bitmap(rs, rb, i);
    836    }
    837}
    838
    839static inline bool migration_bitmap_clear_dirty(RAMState *rs,
    840                                                RAMBlock *rb,
    841                                                unsigned long page)
    842{
    843    bool ret;
    844
    845    /*
    846     * Clear dirty bitmap if needed.  This _must_ be called before we
    847     * send any of the pages in the chunk, because we need to make sure
    848     * we can capture further page content changes the next time we sync
    849     * the dirty log.  So as long as we are going to send any of
    850     * the pages in the chunk, we clear the remote dirty bitmap for all.
    851     * Clearing it earlier won't be a problem, but too late will.
    852     */
    853    migration_clear_memory_region_dirty_bitmap(rs, rb, page);
    854
    855    ret = test_and_clear_bit(page, rb->bmap);
    856    if (ret) {
    857        rs->migration_dirty_pages--;
    858    }
    859
    860    return ret;
    861}
    862
    863/* Called with RCU critical section */
    864static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
    865{
    866    uint64_t new_dirty_pages =
    867        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
    868
    869    rs->migration_dirty_pages += new_dirty_pages;
    870    rs->num_dirty_pages_period += new_dirty_pages;
    871}
    872
    873/**
    874 * ram_pagesize_summary: calculate all the pagesizes of a VM
    875 *
    876 * Returns a summary bitmap of the page sizes of all RAMBlocks
    877 *
    878 * For VMs with just normal pages this is equivalent to the host page
    879 * size. If it's got some huge pages then it's the OR of all the
    880 * different page sizes.
    881 */
    882uint64_t ram_pagesize_summary(void)
    883{
    884    RAMBlock *block;
    885    uint64_t summary = 0;
    886
    887    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
    888        summary |= block->page_size;
    889    }
    890
    891    return summary;
    892}
    893
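/*
 * Editorial worked example, not part of the original ram.c: a guest backed by
 * 4 KiB normal pages plus some 2 MiB hugepages yields
 * 0x1000 | 0x200000 = 0x201000, so more than one bit set in the summary means
 * mixed page sizes are in use.
 */
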
    894uint64_t ram_get_total_transferred_pages(void)
    895{
    896    return  ram_counters.normal + ram_counters.duplicate +
    897                compression_counters.pages + xbzrle_counters.pages;
    898}
    899
    900static void migration_update_rates(RAMState *rs, int64_t end_time)
    901{
    902    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    903    double compressed_size;
    904
    905    /* calculate period counters */
    906    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
    907                / (end_time - rs->time_last_bitmap_sync);
    908
    909    if (!page_count) {
    910        return;
    911    }
    912
    913    if (migrate_use_xbzrle()) {
    914        double encoded_size, unencoded_size;
    915
    916        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
    917            rs->xbzrle_cache_miss_prev) / page_count;
    918        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
    919        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
    920                         TARGET_PAGE_SIZE;
    921        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
    922        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
    923            xbzrle_counters.encoding_rate = 0;
    924        } else {
    925            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
    926        }
    927        rs->xbzrle_pages_prev = xbzrle_counters.pages;
    928        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    929    }
    930
    931    if (migrate_use_compression()) {
    932        compression_counters.busy_rate = (double)(compression_counters.busy -
    933            rs->compress_thread_busy_prev) / page_count;
    934        rs->compress_thread_busy_prev = compression_counters.busy;
    935
    936        compressed_size = compression_counters.compressed_size -
    937                          rs->compressed_size_prev;
    938        if (compressed_size) {
    939            double uncompressed_size = (compression_counters.pages -
    940                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;
    941
    942            /* Compression-Ratio = Uncompressed-size / Compressed-size */
    943            compression_counters.compression_rate =
    944                                        uncompressed_size / compressed_size;
    945
    946            rs->compress_pages_prev = compression_counters.pages;
    947            rs->compressed_size_prev = compression_counters.compressed_size;
    948        }
    949    }
    950}
    951
    952static void migration_trigger_throttle(RAMState *rs)
    953{
    954    MigrationState *s = migrate_get_current();
    955    uint64_t threshold = s->parameters.throttle_trigger_threshold;
    956
    957    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
    958    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    959    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
    960
    961    /* During block migration the auto-converge logic incorrectly detects
    962     * that ram migration makes no progress. Avoid this by disabling the
    963     * throttling logic during the bulk phase of block migration. */
    964    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
    965        /* The following detection logic can be refined later. For now:
    966           Check to see if the ratio between dirtied bytes and the approx.
    967           amount of bytes that just got transferred since the last time
    968           we were in this routine reaches the threshold. If that happens
    969           twice, start or increase throttling. */
    970
    971        if ((bytes_dirty_period > bytes_dirty_threshold) &&
    972            (++rs->dirty_rate_high_cnt >= 2)) {
    973            trace_migration_throttle();
    974            rs->dirty_rate_high_cnt = 0;
    975            mig_throttle_guest_down(bytes_dirty_period,
    976                                    bytes_dirty_threshold);
    977        }
    978    }
    979}
    980
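/*
 * Editorial worked example, not part of the original ram.c: with a
 * throttle-trigger-threshold of 50 (percent), bytes_dirty_threshold is half of
 * what was transferred in the period, so auto-converge starts (or tightens)
 * throttling only once the dirtied bytes have exceeded that threshold twice,
 * matching the "If that happens twice" note above.
 */
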
    981static void migration_bitmap_sync(RAMState *rs)
    982{
    983    RAMBlock *block;
    984    int64_t end_time;
    985
    986    ram_counters.dirty_sync_count++;
    987
    988    if (!rs->time_last_bitmap_sync) {
    989        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    990    }
    991
    992    trace_migration_bitmap_sync_start();
    993    memory_global_dirty_log_sync();
    994
    995    qemu_mutex_lock(&rs->bitmap_mutex);
    996    WITH_RCU_READ_LOCK_GUARD() {
    997        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
    998            ramblock_sync_dirty_bitmap(rs, block);
    999        }
   1000        ram_counters.remaining = ram_bytes_remaining();
   1001    }
   1002    qemu_mutex_unlock(&rs->bitmap_mutex);
   1003
   1004    memory_global_after_dirty_log_sync();
   1005    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
   1006
   1007    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
   1008
   1009    /* more than 1 second = 1000 milliseconds */
   1010    if (end_time > rs->time_last_bitmap_sync + 1000) {
   1011        migration_trigger_throttle(rs);
   1012
   1013        migration_update_rates(rs, end_time);
   1014
   1015        rs->target_page_count_prev = rs->target_page_count;
   1016
   1017        /* reset period counters */
   1018        rs->time_last_bitmap_sync = end_time;
   1019        rs->num_dirty_pages_period = 0;
   1020        rs->bytes_xfer_prev = ram_counters.transferred;
   1021    }
   1022    if (migrate_use_events()) {
   1023        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
   1024    }
   1025}
   1026
   1027static void migration_bitmap_sync_precopy(RAMState *rs)
   1028{
   1029    Error *local_err = NULL;
   1030
   1031    /*
   1032     * The current notifier usage is just an optimization to migration, so we
   1033     * don't stop the normal migration process in the error case.
   1034     */
   1035    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
   1036        error_report_err(local_err);
   1037        local_err = NULL;
   1038    }
   1039
   1040    migration_bitmap_sync(rs);
   1041
   1042    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
   1043        error_report_err(local_err);
   1044    }
   1045}
   1046
   1047/**
   1048 * save_zero_page_to_file: send the zero page to the file
   1049 *
   1050 * Returns the size of data written to the file, 0 means the page is not
   1051 * a zero page
   1052 *
   1053 * @rs: current RAM state
   1054 * @file: the file where the data is saved
   1055 * @block: block that contains the page we want to send
   1056 * @offset: offset inside the block for the page
   1057 */
   1058static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
   1059                                  RAMBlock *block, ram_addr_t offset)
   1060{
   1061    uint8_t *p = block->host + offset;
   1062    int len = 0;
   1063
   1064    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
   1065        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
   1066        qemu_put_byte(file, 0);
   1067        len += 1;
   1068    }
   1069    return len;
   1070}
   1071
   1072/**
   1073 * save_zero_page: send the zero page to the stream
   1074 *
   1075 * Returns the number of pages written.
   1076 *
   1077 * @rs: current RAM state
   1078 * @block: block that contains the page we want to send
   1079 * @offset: offset inside the block for the page
   1080 */
   1081static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
   1082{
   1083    int len = save_zero_page_to_file(rs, rs->f, block, offset);
   1084
   1085    if (len) {
   1086        ram_counters.duplicate++;
   1087        ram_counters.transferred += len;
   1088        return 1;
   1089    }
   1090    return -1;
   1091}
   1092
   1093static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
   1094{
   1095    if (!migrate_release_ram() || !migration_in_postcopy()) {
   1096        return;
   1097    }
   1098
   1099    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
   1100}
   1101
   1102/*
   1103 * @pages: the number of pages written by the control path,
   1104 *        < 0 - error
   1105 *        > 0 - number of pages written
   1106 *
   1107 * Return true if the page has been saved, otherwise false is returned.
   1108 */
   1109static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
   1110                              int *pages)
   1111{
   1112    uint64_t bytes_xmit = 0;
   1113    int ret;
   1114
   1115    *pages = -1;
   1116    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
   1117                                &bytes_xmit);
   1118    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
   1119        return false;
   1120    }
   1121
   1122    if (bytes_xmit) {
   1123        ram_counters.transferred += bytes_xmit;
   1124        *pages = 1;
   1125    }
   1126
   1127    if (ret == RAM_SAVE_CONTROL_DELAYED) {
   1128        return true;
   1129    }
   1130
   1131    if (bytes_xmit > 0) {
   1132        ram_counters.normal++;
   1133    } else if (bytes_xmit == 0) {
   1134        ram_counters.duplicate++;
   1135    }
   1136
   1137    return true;
   1138}
   1139
   1140/*
   1141 * directly send the page to the stream
   1142 *
   1143 * Returns the number of pages written.
   1144 *
   1145 * @rs: current RAM state
   1146 * @block: block that contains the page we want to send
   1147 * @offset: offset inside the block for the page
   1148 * @buf: the page to be sent
   1149 * @async: send the page asynchronously
   1150 */
   1151static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
   1152                            uint8_t *buf, bool async)
   1153{
   1154    ram_counters.transferred += save_page_header(rs, rs->f, block,
   1155                                                 offset | RAM_SAVE_FLAG_PAGE);
   1156    if (async) {
   1157        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
   1158                              migrate_release_ram() &
   1159                              migration_in_postcopy());
   1160    } else {
   1161        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
   1162    }
   1163    ram_counters.transferred += TARGET_PAGE_SIZE;
   1164    ram_counters.normal++;
   1165    return 1;
   1166}
   1167
   1168/**
   1169 * ram_save_page: send the given page to the stream
   1170 *
   1171 * Returns the number of pages written.
   1172 *          < 0 - error
   1173 *          >=0 - Number of pages written - this might legally be 0
   1174 *                if xbzrle noticed the page was the same.
   1175 *
   1176 * @rs: current RAM state
   1177 * @block: block that contains the page we want to send
   1178 * @offset: offset inside the block for the page
   1179 * @last_stage: if we are at the completion stage
   1180 */
   1181static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
   1182{
   1183    int pages = -1;
   1184    uint8_t *p;
   1185    bool send_async = true;
   1186    RAMBlock *block = pss->block;
   1187    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
   1188    ram_addr_t current_addr = block->offset + offset;
   1189
   1190    p = block->host + offset;
   1191    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
   1192
   1193    XBZRLE_cache_lock();
   1194    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
   1195        pages = save_xbzrle_page(rs, &p, current_addr, block,
   1196                                 offset, last_stage);
   1197        if (!last_stage) {
   1198            /* Can't send this cached data async, since the cache page
   1199             * might get updated before it gets to the wire
   1200             */
   1201            send_async = false;
   1202        }
   1203    }
   1204
   1205    /* XBZRLE overflow or normal page */
   1206    if (pages == -1) {
   1207        pages = save_normal_page(rs, block, offset, p, send_async);
   1208    }
   1209
   1210    XBZRLE_cache_unlock();
   1211
   1212    return pages;
   1213}
   1214
   1215static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
   1216                                 ram_addr_t offset)
   1217{
   1218    if (multifd_queue_page(rs->f, block, offset) < 0) {
   1219        return -1;
   1220    }
   1221    ram_counters.normal++;
   1222
   1223    return 1;
   1224}
   1225
   1226static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
   1227                                 ram_addr_t offset, uint8_t *source_buf)
   1228{
   1229    RAMState *rs = ram_state;
   1230    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
   1231    bool zero_page = false;
   1232    int ret;
   1233
   1234    if (save_zero_page_to_file(rs, f, block, offset)) {
   1235        zero_page = true;
   1236        goto exit;
   1237    }
   1238
   1239    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
   1240
   1241    /*
   1242     * copy it to an internal buffer to avoid it being modified by the VM,
   1243     * so that we can catch any error during compression and
   1244     * decompression
   1245     */
   1246    memcpy(source_buf, p, TARGET_PAGE_SIZE);
   1247    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
   1248    if (ret < 0) {
   1249        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
   1250        error_report("compressed data failed!");
   1251        return false;
   1252    }
   1253
   1254exit:
   1255    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
   1256    return zero_page;
   1257}
   1258
   1259static void
   1260update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
   1261{
   1262    ram_counters.transferred += bytes_xmit;
   1263
   1264    if (param->zero_page) {
   1265        ram_counters.duplicate++;
   1266        return;
   1267    }
   1268
   1269    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
   1270    compression_counters.compressed_size += bytes_xmit - 8;
   1271    compression_counters.pages++;
   1272}
   1273
   1274static bool save_page_use_compression(RAMState *rs);
   1275
   1276static void flush_compressed_data(RAMState *rs)
   1277{
   1278    int idx, len, thread_count;
   1279
   1280    if (!save_page_use_compression(rs)) {
   1281        return;
   1282    }
   1283    thread_count = migrate_compress_threads();
   1284
   1285    qemu_mutex_lock(&comp_done_lock);
   1286    for (idx = 0; idx < thread_count; idx++) {
   1287        while (!comp_param[idx].done) {
   1288            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
   1289        }
   1290    }
   1291    qemu_mutex_unlock(&comp_done_lock);
   1292
   1293    for (idx = 0; idx < thread_count; idx++) {
   1294        qemu_mutex_lock(&comp_param[idx].mutex);
   1295        if (!comp_param[idx].quit) {
   1296            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
   1297            /*
   1298             * it's safe to fetch zero_page without holding comp_done_lock
   1299             * as there is no further request submitted to the thread,
   1300             * i.e., the thread should be waiting for a request at this point.
   1301             */
   1302            update_compress_thread_counts(&comp_param[idx], len);
   1303        }
   1304        qemu_mutex_unlock(&comp_param[idx].mutex);
   1305    }
   1306}
   1307
   1308static inline void set_compress_params(CompressParam *param, RAMBlock *block,
   1309                                       ram_addr_t offset)
   1310{
   1311    param->block = block;
   1312    param->offset = offset;
   1313}
   1314
   1315static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
   1316                                           ram_addr_t offset)
   1317{
   1318    int idx, thread_count, bytes_xmit = -1, pages = -1;
   1319    bool wait = migrate_compress_wait_thread();
   1320
   1321    thread_count = migrate_compress_threads();
   1322    qemu_mutex_lock(&comp_done_lock);
   1323retry:
   1324    for (idx = 0; idx < thread_count; idx++) {
   1325        if (comp_param[idx].done) {
   1326            comp_param[idx].done = false;
   1327            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
   1328            qemu_mutex_lock(&comp_param[idx].mutex);
   1329            set_compress_params(&comp_param[idx], block, offset);
   1330            qemu_cond_signal(&comp_param[idx].cond);
   1331            qemu_mutex_unlock(&comp_param[idx].mutex);
   1332            pages = 1;
   1333            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
   1334            break;
   1335        }
   1336    }
   1337
   1338    /*
   1339     * wait for a free thread if the user specifies 'compress-wait-thread',
   1340     * otherwise we will post the page out in the main thread as a normal page.
   1341     */
   1342    if (pages < 0 && wait) {
   1343        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
   1344        goto retry;
   1345    }
   1346    qemu_mutex_unlock(&comp_done_lock);
   1347
   1348    return pages;
   1349}
   1350
   1351/**
   1352 * find_dirty_block: find the next dirty page and update any state
   1353 * associated with the search process.
   1354 *
   1355 * Returns true if a page is found
   1356 *
   1357 * @rs: current RAM state
   1358 * @pss: data about the state of the current dirty page scan
   1359 * @again: set to false if the search has scanned the whole of RAM
   1360 */
   1361static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
   1362{
   1363    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
   1364    if (pss->complete_round && pss->block == rs->last_seen_block &&
   1365        pss->page >= rs->last_page) {
   1366        /*
   1367         * We've been once around the RAM and haven't found anything.
   1368         * Give up.
   1369         */
   1370        *again = false;
   1371        return false;
   1372    }
   1373    if (!offset_in_ramblock(pss->block,
   1374                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
   1375        /* Didn't find anything in this RAM Block */
   1376        pss->page = 0;
   1377        pss->block = QLIST_NEXT_RCU(pss->block, next);
   1378        if (!pss->block) {
   1379            /*
   1380             * If memory migration starts over, we will meet a dirtied page
   1381             * which may still exist in the compression threads' ring, so we
   1382             * should flush the compressed data to make sure the new page
   1383             * is not overwritten by the old one at the destination.
   1384             *
   1385             * Also, if xbzrle is on, stop using data compression at this
   1386             * point. In theory, xbzrle can do better than compression.
   1387             */
   1388            flush_compressed_data(rs);
   1389
   1390            /* Hit the end of the list */
   1391            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
   1392            /* Flag that we've looped */
   1393            pss->complete_round = true;
   1394            /* After the first round, enable XBZRLE. */
   1395            if (migrate_use_xbzrle()) {
   1396                rs->xbzrle_enabled = true;
   1397            }
   1398        }
   1399        /* Didn't find anything this time, but try again on the new block */
   1400        *again = true;
   1401        return false;
   1402    } else {
   1403        /* Can go around again, but... */
   1404        *again = true;
   1405        /* We've found something so probably don't need to */
   1406        return true;
   1407    }
   1408}
   1409
   1410/**
   1411 * unqueue_page: gets a page off the queue
   1412 *
   1413 * Helper for 'get_queued_page' - gets a page off the queue
   1414 *
   1415 * Returns the block of the page (or NULL if none available)
   1416 *
   1417 * @rs: current RAM state
   1418 * @offset: used to return the offset within the RAMBlock
   1419 */
   1420static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
   1421{
   1422    RAMBlock *block = NULL;
   1423
   1424    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
   1425        return NULL;
   1426    }
   1427
   1428    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
   1429    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
   1430        struct RAMSrcPageRequest *entry =
   1431                                QSIMPLEQ_FIRST(&rs->src_page_requests);
   1432        block = entry->rb;
   1433        *offset = entry->offset;
   1434
   1435        if (entry->len > TARGET_PAGE_SIZE) {
   1436            entry->len -= TARGET_PAGE_SIZE;
   1437            entry->offset += TARGET_PAGE_SIZE;
   1438        } else {
   1439            memory_region_unref(block->mr);
   1440            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
   1441            g_free(entry);
   1442            migration_consume_urgent_request();
   1443        }
   1444    }
   1445
   1446    return block;
   1447}
   1448
   1449#if defined(__linux__)
   1450/**
   1451 * poll_fault_page: try to get the next UFFD write fault page and, if a
   1452 *   pending fault is found, return RAM block pointer and page offset
   1453 *
   1454 * Returns pointer to the RAMBlock containing faulting page,
   1455 *   NULL if no write faults are pending
   1456 *
   1457 * @rs: current RAM state
   1458 * @offset: page offset from the beginning of the block
   1459 */
   1460static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
   1461{
   1462    struct uffd_msg uffd_msg;
   1463    void *page_address;
   1464    RAMBlock *block;
   1465    int res;
   1466
   1467    if (!migrate_background_snapshot()) {
   1468        return NULL;
   1469    }
   1470
   1471    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
   1472    if (res <= 0) {
   1473        return NULL;
   1474    }
   1475
   1476    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
   1477    block = qemu_ram_block_from_host(page_address, false, offset);
   1478    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
   1479    return block;
   1480}
   1481
   1482/**
   1483 * ram_save_release_protection: release UFFD write protection after
   1484 *   a range of pages has been saved
   1485 *
   1486 * @rs: current RAM state
   1487 * @pss: page-search-status structure
   1488 * @start_page: index of the first page in the range relative to pss->block
   1489 *
   1490 * Returns 0 on success, negative value in case of an error
    1491 */
   1492static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
   1493        unsigned long start_page)
   1494{
   1495    int res = 0;
   1496
   1497    /* Check if page is from UFFD-managed region. */
   1498    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
   1499        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
   1500        uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
   1501
   1502        /* Flush async buffers before un-protect. */
   1503        qemu_fflush(rs->f);
   1504        /* Un-protect memory range. */
   1505        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
   1506                false, false);
   1507    }
   1508
   1509    return res;
   1510}
   1511
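/*
 * Worked example for the run_length arithmetic above (illustrative only;
 * the numbers assume a 4 KiB target page, i.e. TARGET_PAGE_BITS == 12):
 *
 *   start_page = 256, pss->page = 259
 *   page_address = pss->block->host + (256 << 12)   -> host + 1 MiB
 *   run_length   = (259 - 256 + 1) << 12            -> 4 pages = 16 KiB
 *
 * i.e. the whole range of target pages saved since 'start_page' is
 * un-write-protected by a single uffd_change_protection() call.
 */
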
   1512/* ram_write_tracking_available: check if kernel supports required UFFD features
   1513 *
    1514 * Returns true if supported, false otherwise
   1515 */
   1516bool ram_write_tracking_available(void)
   1517{
   1518    uint64_t uffd_features;
   1519    int res;
   1520
   1521    res = uffd_query_features(&uffd_features);
   1522    return (res == 0 &&
   1523            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
   1524}
   1525
   1526/* ram_write_tracking_compatible: check if guest configuration is
   1527 *   compatible with 'write-tracking'
   1528 *
   1529 * Returns true if compatible, false otherwise
   1530 */
   1531bool ram_write_tracking_compatible(void)
   1532{
   1533    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
   1534    int uffd_fd;
   1535    RAMBlock *block;
   1536    bool ret = false;
   1537
   1538    /* Open UFFD file descriptor */
   1539    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
   1540    if (uffd_fd < 0) {
   1541        return false;
   1542    }
   1543
   1544    RCU_READ_LOCK_GUARD();
   1545
   1546    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   1547        uint64_t uffd_ioctls;
   1548
    1549        /* Nothing to do for read-only and MMIO-writable regions */
   1550        if (block->mr->readonly || block->mr->rom_device) {
   1551            continue;
   1552        }
   1553        /* Try to register block memory via UFFD-IO to track writes */
   1554        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
   1555                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
   1556            goto out;
   1557        }
   1558        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
   1559            goto out;
   1560        }
   1561    }
   1562    ret = true;
   1563
   1564out:
   1565    uffd_close_fd(uffd_fd);
   1566    return ret;
   1567}
   1568
   1569/*
   1570 * ram_block_populate_pages: populate memory in the RAM block by reading
    1571 *   a byte from the beginning of each page.
   1572 *
    1573 * Since it's solely used for the userfault_fd WP feature, we just
    1574 *   hardcode the page size to qemu_real_host_page_size here.
   1575 *
   1576 * @block: RAM block to populate
   1577 */
   1578static void ram_block_populate_pages(RAMBlock *block)
   1579{
   1580    char *ptr = (char *) block->host;
   1581
   1582    for (ram_addr_t offset = 0; offset < block->used_length;
   1583            offset += qemu_real_host_page_size) {
   1584        char tmp = *(ptr + offset);
   1585
   1586        /* Don't optimize the read out */
   1587        asm volatile("" : "+r" (tmp));
   1588    }
   1589}
   1590
   1591/*
   1592 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
   1593 */
   1594void ram_write_tracking_prepare(void)
   1595{
   1596    RAMBlock *block;
   1597
   1598    RCU_READ_LOCK_GUARD();
   1599
   1600    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
    1601        /* Nothing to do for read-only and MMIO-writable regions */
   1602        if (block->mr->readonly || block->mr->rom_device) {
   1603            continue;
   1604        }
   1605
   1606        /*
   1607         * Populate pages of the RAM block before enabling userfault_fd
   1608         * write protection.
   1609         *
   1610         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
   1611         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
    1612         * pages with pte_none() entries in the page table.
   1613         */
   1614        ram_block_populate_pages(block);
   1615    }
   1616}
   1617
   1618/*
   1619 * ram_write_tracking_start: start UFFD-WP memory tracking
   1620 *
   1621 * Returns 0 for success or negative value in case of error
   1622 */
   1623int ram_write_tracking_start(void)
   1624{
   1625    int uffd_fd;
   1626    RAMState *rs = ram_state;
   1627    RAMBlock *block;
   1628
   1629    /* Open UFFD file descriptor */
   1630    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
   1631    if (uffd_fd < 0) {
   1632        return uffd_fd;
   1633    }
   1634    rs->uffdio_fd = uffd_fd;
   1635
   1636    RCU_READ_LOCK_GUARD();
   1637
   1638    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
    1639        /* Nothing to do for read-only and MMIO-writable regions */
   1640        if (block->mr->readonly || block->mr->rom_device) {
   1641            continue;
   1642        }
   1643
   1644        /* Register block memory with UFFD to track writes */
   1645        if (uffd_register_memory(rs->uffdio_fd, block->host,
   1646                block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
   1647            goto fail;
   1648        }
   1649        /* Apply UFFD write protection to the block memory range */
   1650        if (uffd_change_protection(rs->uffdio_fd, block->host,
   1651                block->max_length, true, false)) {
   1652            goto fail;
   1653        }
   1654        block->flags |= RAM_UF_WRITEPROTECT;
   1655        memory_region_ref(block->mr);
   1656
   1657        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
   1658                block->host, block->max_length);
   1659    }
   1660
   1661    return 0;
   1662
   1663fail:
   1664    error_report("ram_write_tracking_start() failed: restoring initial memory state");
   1665
   1666    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   1667        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
   1668            continue;
   1669        }
   1670        /*
    1671         * In case some memory block failed to be write-protected,
    1672         * remove protection from and unregister all blocks that succeeded.
   1673         */
   1674        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
   1675                false, false);
   1676        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
   1677        /* Cleanup flags and remove reference */
   1678        block->flags &= ~RAM_UF_WRITEPROTECT;
   1679        memory_region_unref(block->mr);
   1680    }
   1681
   1682    uffd_close_fd(uffd_fd);
   1683    rs->uffdio_fd = -1;
   1684    return -1;
   1685}
   1686
   1687/**
   1688 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
   1689 */
   1690void ram_write_tracking_stop(void)
   1691{
   1692    RAMState *rs = ram_state;
   1693    RAMBlock *block;
   1694
   1695    RCU_READ_LOCK_GUARD();
   1696
   1697    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   1698        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
   1699            continue;
   1700        }
   1701        /* Remove protection and unregister all affected RAM blocks */
   1702        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
   1703                false, false);
   1704        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
   1705
   1706        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
   1707                block->host, block->max_length);
   1708
   1709        /* Cleanup flags and remove reference */
   1710        block->flags &= ~RAM_UF_WRITEPROTECT;
   1711        memory_region_unref(block->mr);
   1712    }
   1713
   1714    /* Finally close UFFD file descriptor */
   1715    uffd_close_fd(rs->uffdio_fd);
   1716    rs->uffdio_fd = -1;
   1717}
   1718
   1719#else
   1720/* No target OS support, stubs just fail or ignore */
   1721
   1722static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
   1723{
   1724    (void) rs;
   1725    (void) offset;
   1726
   1727    return NULL;
   1728}
   1729
   1730static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
   1731        unsigned long start_page)
   1732{
   1733    (void) rs;
   1734    (void) pss;
   1735    (void) start_page;
   1736
   1737    return 0;
   1738}
   1739
   1740bool ram_write_tracking_available(void)
   1741{
   1742    return false;
   1743}
   1744
   1745bool ram_write_tracking_compatible(void)
   1746{
   1747    assert(0);
   1748    return false;
   1749}
   1750
   1751int ram_write_tracking_start(void)
   1752{
   1753    assert(0);
   1754    return -1;
   1755}
   1756
   1757void ram_write_tracking_stop(void)
   1758{
   1759    assert(0);
   1760}
   1761#endif /* defined(__linux__) */
   1762
   1763/**
   1764 * get_queued_page: unqueue a page from the postcopy requests
   1765 *
   1766 * Skips pages that are already sent (!dirty)
   1767 *
   1768 * Returns true if a queued page is found
   1769 *
   1770 * @rs: current RAM state
   1771 * @pss: data about the state of the current dirty page scan
   1772 */
   1773static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
   1774{
   1775    RAMBlock  *block;
   1776    ram_addr_t offset;
   1777    bool dirty;
   1778
   1779    do {
   1780        block = unqueue_page(rs, &offset);
   1781        /*
   1782         * We're sending this page, and since it's postcopy nothing else
   1783         * will dirty it, and we must make sure it doesn't get sent again
   1784         * even if this queue request was received after the background
   1785         * search already sent it.
   1786         */
   1787        if (block) {
   1788            unsigned long page;
   1789
   1790            page = offset >> TARGET_PAGE_BITS;
   1791            dirty = test_bit(page, block->bmap);
   1792            if (!dirty) {
   1793                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
   1794                                                page);
   1795            } else {
   1796                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
   1797            }
   1798        }
   1799
   1800    } while (block && !dirty);
   1801
   1802    if (!block) {
   1803        /*
   1804         * Poll write faults too if background snapshot is enabled; that's
    1805         * when vCPUs may get blocked by write-protected pages.
   1806         */
   1807        block = poll_fault_page(rs, &offset);
   1808    }
   1809
   1810    if (block) {
   1811        /*
   1812         * We want the background search to continue from the queued page
   1813         * since the guest is likely to want other pages near to the page
   1814         * it just requested.
   1815         */
   1816        pss->block = block;
   1817        pss->page = offset >> TARGET_PAGE_BITS;
   1818
   1819        /*
    1820         * This unqueued page would break the "one round" check, even if
    1821         * that is really rare.
   1822         */
   1823        pss->complete_round = false;
   1824    }
   1825
   1826    return !!block;
   1827}
   1828
   1829/**
   1830 * migration_page_queue_free: drop any remaining pages in the ram
   1831 * request queue
   1832 *
   1833 * It should be empty at the end anyway, but in error cases there may
    1834 * be some left.  In case any page is left, we drop it.
   1835 *
   1836 */
   1837static void migration_page_queue_free(RAMState *rs)
   1838{
   1839    struct RAMSrcPageRequest *mspr, *next_mspr;
    1840    /* This queue should generally be empty - but in the case of a failed
    1841     * migration it might have some entries left over.
   1842     */
   1843    RCU_READ_LOCK_GUARD();
   1844    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
   1845        memory_region_unref(mspr->rb->mr);
   1846        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
   1847        g_free(mspr);
   1848    }
   1849}
   1850
   1851/**
   1852 * ram_save_queue_pages: queue the page for transmission
   1853 *
    1854 * A request from the postcopy destination, for example.
   1855 *
   1856 * Returns zero on success or negative on error
   1857 *
    1858 * @rbname: Name of the RAMBlock of the request. NULL means the
    1859 *          same as the last one.
   1860 * @start: starting address from the start of the RAMBlock
   1861 * @len: length (in bytes) to send
   1862 */
   1863int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
   1864{
   1865    RAMBlock *ramblock;
   1866    RAMState *rs = ram_state;
   1867
   1868    ram_counters.postcopy_requests++;
   1869    RCU_READ_LOCK_GUARD();
   1870
   1871    if (!rbname) {
   1872        /* Reuse last RAMBlock */
   1873        ramblock = rs->last_req_rb;
   1874
   1875        if (!ramblock) {
   1876            /*
   1877             * Shouldn't happen, we can't reuse the last RAMBlock if
   1878             * it's the 1st request.
   1879             */
   1880            error_report("ram_save_queue_pages no previous block");
   1881            return -1;
   1882        }
   1883    } else {
   1884        ramblock = qemu_ram_block_by_name(rbname);
   1885
   1886        if (!ramblock) {
   1887            /* We shouldn't be asked for a non-existent RAMBlock */
   1888            error_report("ram_save_queue_pages no block '%s'", rbname);
   1889            return -1;
   1890        }
   1891        rs->last_req_rb = ramblock;
   1892    }
   1893    trace_ram_save_queue_pages(ramblock->idstr, start, len);
   1894    if (!offset_in_ramblock(ramblock, start + len - 1)) {
   1895        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
   1896                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
   1897                     __func__, start, len, ramblock->used_length);
   1898        return -1;
   1899    }
   1900
   1901    struct RAMSrcPageRequest *new_entry =
   1902        g_malloc0(sizeof(struct RAMSrcPageRequest));
   1903    new_entry->rb = ramblock;
   1904    new_entry->offset = start;
   1905    new_entry->len = len;
   1906
   1907    memory_region_ref(ramblock->mr);
   1908    qemu_mutex_lock(&rs->src_page_req_mutex);
   1909    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
   1910    migration_make_urgent_request();
   1911    qemu_mutex_unlock(&rs->src_page_req_mutex);
   1912
   1913    return 0;
   1914}
   1915
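/*
 * Illustrative sketch of how a queued request is consumed (hypothetical
 * values, assuming a 4 KiB target page and a destination that requested
 * one 2 MiB huge page from a block named "pc.ram"):
 *
 *   ram_save_queue_pages("pc.ram", 0x40000000, 0x200000);
 *
 * creates a single RAMSrcPageRequest { rb, offset = 0x40000000,
 * len = 0x200000 }.  Each subsequent unqueue_page() call returns that block
 * with *offset set to the current offset, then either advances offset/len
 * by TARGET_PAGE_SIZE or, on the last target page, drops the entry; so the
 * 2 MiB request is drained over 512 calls.
 */
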
   1916static bool save_page_use_compression(RAMState *rs)
   1917{
   1918    if (!migrate_use_compression()) {
   1919        return false;
   1920    }
   1921
   1922    /*
    1923     * If xbzrle is enabled (e.g., after the first round of migration), stop
   1924     * using the data compression. In theory, xbzrle can do better than
   1925     * compression.
   1926     */
   1927    if (rs->xbzrle_enabled) {
   1928        return false;
   1929    }
   1930
   1931    return true;
   1932}
   1933
   1934/*
   1935 * try to compress the page before posting it out, return true if the page
    1936 * has been properly handled by compression; otherwise it needs other
   1937 * paths to handle it
   1938 */
   1939static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
   1940{
   1941    if (!save_page_use_compression(rs)) {
   1942        return false;
   1943    }
   1944
   1945    /*
   1946     * When starting the process of a new block, the first page of
   1947     * the block should be sent out before other pages in the same
    1948     * block, and all the pages in the last block should have been sent
    1949     * out; keeping this order is important, because the 'cont' flag
    1950     * is used to avoid resending the block name.
    1951     *
    1952     * We post the first page as a normal page as compression will take
   1953     * much CPU resource.
   1954     */
   1955    if (block != rs->last_sent_block) {
   1956        flush_compressed_data(rs);
   1957        return false;
   1958    }
   1959
   1960    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
   1961        return true;
   1962    }
   1963
   1964    compression_counters.busy++;
   1965    return false;
   1966}
   1967
   1968/**
   1969 * ram_save_target_page: save one target page
   1970 *
   1971 * Returns the number of pages written
   1972 *
   1973 * @rs: current RAM state
   1974 * @pss: data about the page we want to send
   1975 * @last_stage: if we are at the completion stage
   1976 */
   1977static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
   1978                                bool last_stage)
   1979{
   1980    RAMBlock *block = pss->block;
   1981    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
   1982    int res;
   1983
   1984    if (control_save_page(rs, block, offset, &res)) {
   1985        return res;
   1986    }
   1987
   1988    if (save_compress_page(rs, block, offset)) {
   1989        return 1;
   1990    }
   1991
   1992    res = save_zero_page(rs, block, offset);
   1993    if (res > 0) {
   1994        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
   1995         * page would be stale
   1996         */
   1997        if (!save_page_use_compression(rs)) {
   1998            XBZRLE_cache_lock();
   1999            xbzrle_cache_zero_page(rs, block->offset + offset);
   2000            XBZRLE_cache_unlock();
   2001        }
   2002        ram_release_pages(block->idstr, offset, res);
   2003        return res;
   2004    }
   2005
   2006    /*
   2007     * Do not use multifd for:
   2008     * 1. Compression as the first page in the new block should be posted out
   2009     *    before sending the compressed page
   2010     * 2. In postcopy as one whole host page should be placed
   2011     */
   2012    if (!save_page_use_compression(rs) && migrate_use_multifd()
   2013        && !migration_in_postcopy()) {
   2014        return ram_save_multifd_page(rs, block, offset);
   2015    }
   2016
   2017    return ram_save_page(rs, pss, last_stage);
   2018}
   2019
   2020/**
   2021 * ram_save_host_page: save a whole host page
   2022 *
    2023 * Starting at pss->page, send pages up to the end of the current host
    2024 * page. It's valid for the starting page to point into the middle of
    2025 * a host page, in which case the remainder of the host page is sent.
   2026 * Only dirty target pages are sent. Note that the host page size may
   2027 * be a huge page for this block.
   2028 * The saving stops at the boundary of the used_length of the block
   2029 * if the RAMBlock isn't a multiple of the host page size.
   2030 *
   2031 * Returns the number of pages written or negative on error
   2032 *
   2033 * @rs: current RAM state
   2035 * @pss: data about the page we want to send
   2036 * @last_stage: if we are at the completion stage
   2037 */
   2038static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
   2039                              bool last_stage)
   2040{
   2041    int tmppages, pages = 0;
   2042    size_t pagesize_bits =
   2043        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
   2044    unsigned long hostpage_boundary =
   2045        QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
   2046    unsigned long start_page = pss->page;
   2047    int res;
   2048
   2049    if (ramblock_is_ignored(pss->block)) {
   2050        error_report("block %s should not be migrated !", pss->block->idstr);
   2051        return 0;
   2052    }
   2053
   2054    do {
    2055        /* Check if the page is dirty and, if it is, send it */
   2056        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
   2057            tmppages = ram_save_target_page(rs, pss, last_stage);
   2058            if (tmppages < 0) {
   2059                return tmppages;
   2060            }
   2061
   2062            pages += tmppages;
   2063            /*
   2064             * Allow rate limiting to happen in the middle of huge pages if
   2065             * something is sent in the current iteration.
   2066             */
   2067            if (pagesize_bits > 1 && tmppages > 0) {
   2068                migration_rate_limit();
   2069            }
   2070        }
   2071        pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
   2072    } while ((pss->page < hostpage_boundary) &&
   2073             offset_in_ramblock(pss->block,
   2074                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
   2075    /* The offset we leave with is the min boundary of host page and block */
   2076    pss->page = MIN(pss->page, hostpage_boundary) - 1;
   2077
   2078    res = ram_save_release_protection(rs, pss, start_page);
   2079    return (res < 0 ? res : pages);
   2080}
   2081
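/*
 * Worked example for the host page boundary arithmetic above (illustrative
 * only; assumes a 4 KiB target page and a RAMBlock backed by 2 MiB huge
 * pages):
 *
 *   pagesize_bits     = 2 MiB / 4 KiB = 512
 *   pss->page         = 1000   (somewhere inside the 2nd host page)
 *   hostpage_boundary = QEMU_ALIGN_UP(1001, 512) = 1024
 *
 * so the loop keeps sending dirty target pages until pss->page reaches 1024
 * (or runs past used_length), and on exit pss->page is set to at most 1023,
 * i.e. the caller resumes roughly at the host page boundary.
 */
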
   2082/**
   2083 * ram_find_and_save_block: finds a dirty page and sends it to f
   2084 *
   2085 * Called within an RCU critical section.
   2086 *
   2087 * Returns the number of pages written where zero means no dirty pages,
   2088 * or negative on error
   2089 *
   2090 * @rs: current RAM state
   2091 * @last_stage: if we are at the completion stage
   2092 *
   2093 * On systems where host-page-size > target-page-size it will send all the
   2094 * pages in a host page that are dirty.
   2095 */
   2096
   2097static int ram_find_and_save_block(RAMState *rs, bool last_stage)
   2098{
   2099    PageSearchStatus pss;
   2100    int pages = 0;
   2101    bool again, found;
   2102
   2103    /* No dirty page as there is zero RAM */
   2104    if (!ram_bytes_total()) {
   2105        return pages;
   2106    }
   2107
   2108    pss.block = rs->last_seen_block;
   2109    pss.page = rs->last_page;
   2110    pss.complete_round = false;
   2111
   2112    if (!pss.block) {
   2113        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
   2114    }
   2115
   2116    do {
   2117        again = true;
   2118        found = get_queued_page(rs, &pss);
   2119
   2120        if (!found) {
   2121            /* priority queue empty, so just search for something dirty */
   2122            found = find_dirty_block(rs, &pss, &again);
   2123        }
   2124
   2125        if (found) {
   2126            pages = ram_save_host_page(rs, &pss, last_stage);
   2127        }
   2128    } while (!pages && again);
   2129
   2130    rs->last_seen_block = pss.block;
   2131    rs->last_page = pss.page;
   2132
   2133    return pages;
   2134}
   2135
   2136void acct_update_position(QEMUFile *f, size_t size, bool zero)
   2137{
   2138    uint64_t pages = size / TARGET_PAGE_SIZE;
   2139
   2140    if (zero) {
   2141        ram_counters.duplicate += pages;
   2142    } else {
   2143        ram_counters.normal += pages;
   2144        ram_counters.transferred += size;
   2145        qemu_update_position(f, size);
   2146    }
   2147}
   2148
   2149static uint64_t ram_bytes_total_common(bool count_ignored)
   2150{
   2151    RAMBlock *block;
   2152    uint64_t total = 0;
   2153
   2154    RCU_READ_LOCK_GUARD();
   2155
   2156    if (count_ignored) {
   2157        RAMBLOCK_FOREACH_MIGRATABLE(block) {
   2158            total += block->used_length;
   2159        }
   2160    } else {
   2161        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2162            total += block->used_length;
   2163        }
   2164    }
   2165    return total;
   2166}
   2167
   2168uint64_t ram_bytes_total(void)
   2169{
   2170    return ram_bytes_total_common(false);
   2171}
   2172
   2173static void xbzrle_load_setup(void)
   2174{
   2175    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
   2176}
   2177
   2178static void xbzrle_load_cleanup(void)
   2179{
   2180    g_free(XBZRLE.decoded_buf);
   2181    XBZRLE.decoded_buf = NULL;
   2182}
   2183
   2184static void ram_state_cleanup(RAMState **rsp)
   2185{
   2186    if (*rsp) {
   2187        migration_page_queue_free(*rsp);
   2188        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
   2189        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
   2190        g_free(*rsp);
   2191        *rsp = NULL;
   2192    }
   2193}
   2194
   2195static void xbzrle_cleanup(void)
   2196{
   2197    XBZRLE_cache_lock();
   2198    if (XBZRLE.cache) {
   2199        cache_fini(XBZRLE.cache);
   2200        g_free(XBZRLE.encoded_buf);
   2201        g_free(XBZRLE.current_buf);
   2202        g_free(XBZRLE.zero_target_page);
   2203        XBZRLE.cache = NULL;
   2204        XBZRLE.encoded_buf = NULL;
   2205        XBZRLE.current_buf = NULL;
   2206        XBZRLE.zero_target_page = NULL;
   2207    }
   2208    XBZRLE_cache_unlock();
   2209}
   2210
   2211static void ram_save_cleanup(void *opaque)
   2212{
   2213    RAMState **rsp = opaque;
   2214    RAMBlock *block;
   2215
   2216    /* We don't use dirty log with background snapshots */
   2217    if (!migrate_background_snapshot()) {
    2218        /* The caller must hold the iothread lock or be in a BH, so there is
    2219         * no write race against the migration bitmap
   2220         */
   2221        memory_global_dirty_log_stop();
   2222    }
   2223
   2224    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2225        g_free(block->clear_bmap);
   2226        block->clear_bmap = NULL;
   2227        g_free(block->bmap);
   2228        block->bmap = NULL;
   2229    }
   2230
   2231    xbzrle_cleanup();
   2232    compress_threads_save_cleanup();
   2233    ram_state_cleanup(rsp);
   2234}
   2235
   2236static void ram_state_reset(RAMState *rs)
   2237{
   2238    rs->last_seen_block = NULL;
   2239    rs->last_sent_block = NULL;
   2240    rs->last_page = 0;
   2241    rs->last_version = ram_list.version;
   2242    rs->xbzrle_enabled = false;
   2243}
   2244
   2245#define MAX_WAIT 50 /* ms, half buffered_file limit */
   2246
   2247/*
   2248 * 'expected' is the value you expect the bitmap mostly to be full
   2249 * of; it won't bother printing lines that are all this value.
   2250 * If 'todump' is null the migration bitmap is dumped.
   2251 */
   2252void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
   2253                           unsigned long pages)
   2254{
   2255    int64_t cur;
   2256    int64_t linelen = 128;
   2257    char linebuf[129];
   2258
   2259    for (cur = 0; cur < pages; cur += linelen) {
   2260        int64_t curb;
   2261        bool found = false;
   2262        /*
   2263         * Last line; catch the case where the line length
   2264         * is longer than remaining ram
   2265         */
   2266        if (cur + linelen > pages) {
   2267            linelen = pages - cur;
   2268        }
   2269        for (curb = 0; curb < linelen; curb++) {
   2270            bool thisbit = test_bit(cur + curb, todump);
   2271            linebuf[curb] = thisbit ? '1' : '.';
   2272            found = found || (thisbit != expected);
   2273        }
   2274        if (found) {
   2275            linebuf[curb] = '\0';
   2276            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
   2277        }
   2278    }
   2279}
   2280
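/*
 * Example of the dump format produced above (hypothetical bitmap contents,
 * expected == false): each line covers up to 128 bits starting at the page
 * index printed in hex, with '1' for a set bit and '.' for a clear one;
 * lines that contain only the expected value are suppressed, e.g.
 * (truncated here):
 *
 *   0x00000080 : 1...........1111................1...
 */
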
   2281/* **** functions for postcopy ***** */
   2282
   2283void ram_postcopy_migrated_memory_release(MigrationState *ms)
   2284{
   2285    struct RAMBlock *block;
   2286
   2287    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2288        unsigned long *bitmap = block->bmap;
   2289        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
   2290        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
   2291
   2292        while (run_start < range) {
   2293            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
   2294            ram_discard_range(block->idstr,
   2295                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
   2296                              ((ram_addr_t)(run_end - run_start))
   2297                                << TARGET_PAGE_BITS);
   2298            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
   2299        }
   2300    }
   2301}
   2302
   2303/**
   2304 * postcopy_send_discard_bm_ram: discard a RAMBlock
   2305 *
   2306 * Returns zero on success
   2307 *
   2308 * Callback from postcopy_each_ram_send_discard for each RAMBlock
   2309 *
   2310 * @ms: current migration state
   2311 * @block: RAMBlock to discard
   2312 */
   2313static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
   2314{
   2315    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
   2316    unsigned long current;
   2317    unsigned long *bitmap = block->bmap;
   2318
   2319    for (current = 0; current < end; ) {
   2320        unsigned long one = find_next_bit(bitmap, end, current);
   2321        unsigned long zero, discard_length;
   2322
   2323        if (one >= end) {
   2324            break;
   2325        }
   2326
   2327        zero = find_next_zero_bit(bitmap, end, one + 1);
   2328
   2329        if (zero >= end) {
   2330            discard_length = end - one;
   2331        } else {
   2332            discard_length = zero - one;
   2333        }
   2334        postcopy_discard_send_range(ms, one, discard_length);
   2335        current = one + discard_length;
   2336    }
   2337
   2338    return 0;
   2339}
   2340
   2341/**
   2342 * postcopy_each_ram_send_discard: discard all RAMBlocks
   2343 *
   2344 * Returns 0 for success or negative for error
   2345 *
   2346 * Utility for the outgoing postcopy code.
   2347 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
   2348 *   passing it bitmap indexes and name.
   2349 * (qemu_ram_foreach_block ends up passing unscaled lengths
   2350 *  which would mean postcopy code would have to deal with target page)
   2351 *
   2352 * @ms: current migration state
   2353 */
   2354static int postcopy_each_ram_send_discard(MigrationState *ms)
   2355{
   2356    struct RAMBlock *block;
   2357    int ret;
   2358
   2359    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2360        postcopy_discard_send_init(ms, block->idstr);
   2361
   2362        /*
   2363         * Postcopy sends chunks of bitmap over the wire, but it
    2364         * just needs indexes at this point, which avoids it having
   2365         * target page specific code.
   2366         */
   2367        ret = postcopy_send_discard_bm_ram(ms, block);
   2368        postcopy_discard_send_finish(ms);
   2369        if (ret) {
   2370            return ret;
   2371        }
   2372    }
   2373
   2374    return 0;
   2375}
   2376
   2377/**
   2378 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
   2379 *
    2380 * Helper for postcopy_chunk_hostpages; it canonicalizes the dirty
    2381 * bitmap of a RAMBlock so that it is consistent at host page
    2382 * granularity.
    2383 *
    2384 * Postcopy requires that all target pages in a hostpage are dirty or
    2385 * clean, not a mix.  This function canonicalizes the bitmap.
   2386 *
   2387 * @ms: current migration state
   2388 * @block: block that contains the page we want to canonicalize
   2389 */
   2390static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
   2391{
   2392    RAMState *rs = ram_state;
   2393    unsigned long *bitmap = block->bmap;
   2394    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
   2395    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
   2396    unsigned long run_start;
   2397
   2398    if (block->page_size == TARGET_PAGE_SIZE) {
   2399        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
   2400        return;
   2401    }
   2402
   2403    /* Find a dirty page */
   2404    run_start = find_next_bit(bitmap, pages, 0);
   2405
   2406    while (run_start < pages) {
   2407
   2408        /*
   2409         * If the start of this run of pages is in the middle of a host
   2410         * page, then we need to fixup this host page.
   2411         */
   2412        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
   2413            /* Find the end of this run */
   2414            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
   2415            /*
   2416             * If the end isn't at the start of a host page, then the
   2417             * run doesn't finish at the end of a host page
   2418             * and we need to discard.
   2419             */
   2420        }
   2421
   2422        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
   2423            unsigned long page;
   2424            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
   2425                                                             host_ratio);
   2426            run_start = QEMU_ALIGN_UP(run_start, host_ratio);
   2427
   2428            /* Clean up the bitmap */
   2429            for (page = fixup_start_addr;
   2430                 page < fixup_start_addr + host_ratio; page++) {
   2431                /*
   2432                 * Remark them as dirty, updating the count for any pages
   2433                 * that weren't previously dirty.
   2434                 */
   2435                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
   2436            }
   2437        }
   2438
   2439        /* Find the next dirty page for the next iteration */
   2440        run_start = find_next_bit(bitmap, pages, run_start);
   2441    }
   2442}
   2443
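/*
 * Worked example of the canonicalization above (illustrative only; assumes
 * 4 KiB target pages inside a 2 MiB huge-page RAMBlock, so host_ratio ==
 * 512): if the bitmap has a dirty run covering target pages [700, 900),
 * neither end is host-page aligned, so the whole host page range [512, 1024)
 * is marked dirty again (re-dirtying (700 - 512) + (1024 - 900) = 312 extra
 * pages), and migration_dirty_pages is bumped for each page that was not
 * already set.  After the pass every host page is either fully dirty or
 * fully clean.
 */
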
   2444/**
   2445 * postcopy_chunk_hostpages: discard any partially sent host page
   2446 *
   2447 * Utility for the outgoing postcopy code.
   2448 *
   2449 * Discard any partially sent host-page size chunks, mark any partially
   2450 * dirty host-page size chunks as all dirty.  In this case the host-page
   2451 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
   2452 *
   2453 * Returns zero on success
   2454 *
   2455 * @ms: current migration state
   2456 * @block: block we want to work with
   2457 */
   2458static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
   2459{
   2460    postcopy_discard_send_init(ms, block->idstr);
   2461
   2462    /*
   2463     * Ensure that all partially dirty host pages are made fully dirty.
   2464     */
   2465    postcopy_chunk_hostpages_pass(ms, block);
   2466
   2467    postcopy_discard_send_finish(ms);
   2468    return 0;
   2469}
   2470
   2471/**
   2472 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
   2473 *
   2474 * Returns zero on success
   2475 *
    2476 * Transmit the set of pages to be discarded after precopy to the target;
    2477 * these are pages that:
    2478 *     a) Have been previously transmitted but are now dirty again
    2479 *     b) Have never been transmitted; this ensures that
    2480 *        any pages on the destination that have been mapped by background
    2481 *        tasks get discarded (transparent huge pages are the specific concern)
    2482 * Hopefully this is pretty sparse.
   2483 *
   2484 * @ms: current migration state
   2485 */
   2486int ram_postcopy_send_discard_bitmap(MigrationState *ms)
   2487{
   2488    RAMState *rs = ram_state;
   2489    RAMBlock *block;
   2490    int ret;
   2491
   2492    RCU_READ_LOCK_GUARD();
   2493
   2494    /* This should be our last sync, the src is now paused */
   2495    migration_bitmap_sync(rs);
   2496
   2497    /* Easiest way to make sure we don't resume in the middle of a host-page */
   2498    rs->last_seen_block = NULL;
   2499    rs->last_sent_block = NULL;
   2500    rs->last_page = 0;
   2501
   2502    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2503        /* Deal with TPS != HPS and huge pages */
   2504        ret = postcopy_chunk_hostpages(ms, block);
   2505        if (ret) {
   2506            return ret;
   2507        }
   2508
   2509#ifdef DEBUG_POSTCOPY
   2510        ram_debug_dump_bitmap(block->bmap, true,
   2511                              block->used_length >> TARGET_PAGE_BITS);
   2512#endif
   2513    }
   2514    trace_ram_postcopy_send_discard_bitmap();
   2515
   2516    return postcopy_each_ram_send_discard(ms);
   2517}
   2518
   2519/**
   2520 * ram_discard_range: discard dirtied pages at the beginning of postcopy
   2521 *
   2522 * Returns zero on success
   2523 *
    2524 * @rbname: name of the RAMBlock of the request. NULL means the
    2525 *          same as the last one.
    2526 * @start: start offset of the range within the RAMBlock
    2527 * @length: length of the range in bytes
   2528 */
   2529int ram_discard_range(const char *rbname, uint64_t start, size_t length)
   2530{
   2531    trace_ram_discard_range(rbname, start, length);
   2532
   2533    RCU_READ_LOCK_GUARD();
   2534    RAMBlock *rb = qemu_ram_block_by_name(rbname);
   2535
   2536    if (!rb) {
   2537        error_report("ram_discard_range: Failed to find block '%s'", rbname);
   2538        return -1;
   2539    }
   2540
   2541    /*
   2542     * On source VM, we don't need to update the received bitmap since
   2543     * we don't even have one.
   2544     */
   2545    if (rb->receivedmap) {
   2546        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
   2547                     length >> qemu_target_page_bits());
   2548    }
   2549
   2550    return ram_block_discard_range(rb, start, length);
   2551}
   2552
   2553/*
    2554 * For every allocation, we try not to crash the VM if the
    2555 * allocation fails.
   2556 */
   2557static int xbzrle_init(void)
   2558{
   2559    Error *local_err = NULL;
   2560
   2561    if (!migrate_use_xbzrle()) {
   2562        return 0;
   2563    }
   2564
   2565    XBZRLE_cache_lock();
   2566
   2567    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
   2568    if (!XBZRLE.zero_target_page) {
   2569        error_report("%s: Error allocating zero page", __func__);
   2570        goto err_out;
   2571    }
   2572
   2573    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
   2574                              TARGET_PAGE_SIZE, &local_err);
   2575    if (!XBZRLE.cache) {
   2576        error_report_err(local_err);
   2577        goto free_zero_page;
   2578    }
   2579
   2580    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
   2581    if (!XBZRLE.encoded_buf) {
   2582        error_report("%s: Error allocating encoded_buf", __func__);
   2583        goto free_cache;
   2584    }
   2585
   2586    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
   2587    if (!XBZRLE.current_buf) {
   2588        error_report("%s: Error allocating current_buf", __func__);
   2589        goto free_encoded_buf;
   2590    }
   2591
   2592    /* We are all good */
   2593    XBZRLE_cache_unlock();
   2594    return 0;
   2595
   2596free_encoded_buf:
   2597    g_free(XBZRLE.encoded_buf);
   2598    XBZRLE.encoded_buf = NULL;
   2599free_cache:
   2600    cache_fini(XBZRLE.cache);
   2601    XBZRLE.cache = NULL;
   2602free_zero_page:
   2603    g_free(XBZRLE.zero_target_page);
   2604    XBZRLE.zero_target_page = NULL;
   2605err_out:
   2606    XBZRLE_cache_unlock();
   2607    return -ENOMEM;
   2608}
   2609
   2610static int ram_state_init(RAMState **rsp)
   2611{
   2612    *rsp = g_try_new0(RAMState, 1);
   2613
   2614    if (!*rsp) {
   2615        error_report("%s: Init ramstate fail", __func__);
   2616        return -1;
   2617    }
   2618
   2619    qemu_mutex_init(&(*rsp)->bitmap_mutex);
   2620    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
   2621    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
   2622
   2623    /*
   2624     * Count the total number of pages used by ram blocks not including any
   2625     * gaps due to alignment or unplugs.
    2626     * This must match the initial value of the dirty bitmap.
   2627     */
   2628    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
   2629    ram_state_reset(*rsp);
   2630
   2631    return 0;
   2632}
   2633
   2634static void ram_list_init_bitmaps(void)
   2635{
   2636    MigrationState *ms = migrate_get_current();
   2637    RAMBlock *block;
   2638    unsigned long pages;
   2639    uint8_t shift;
   2640
   2641    /* Skip setting bitmap if there is no RAM */
   2642    if (ram_bytes_total()) {
   2643        shift = ms->clear_bitmap_shift;
   2644        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
   2645            error_report("clear_bitmap_shift (%u) too big, using "
   2646                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
   2647            shift = CLEAR_BITMAP_SHIFT_MAX;
   2648        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
   2649            error_report("clear_bitmap_shift (%u) too small, using "
   2650                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
   2651            shift = CLEAR_BITMAP_SHIFT_MIN;
   2652        }
   2653
   2654        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2655            pages = block->max_length >> TARGET_PAGE_BITS;
   2656            /*
   2657             * The initial dirty bitmap for migration must be set with all
   2658             * ones to make sure we'll migrate every guest RAM page to
    2659             * the destination.
    2660             * Here we set RAMBlock.bmap all to 1 because when we restart a
    2661             * new migration after a failed one, ram_list.
    2662             * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
   2663             * guest memory.
   2664             */
   2665            block->bmap = bitmap_new(pages);
   2666            bitmap_set(block->bmap, 0, pages);
   2667            block->clear_bmap_shift = shift;
   2668            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
   2669        }
   2670    }
   2671}
   2672
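/*
 * Illustrative sizing for the bitmaps above (hypothetical 4 GiB RAMBlock,
 * 4 KiB target pages, clear_bitmap_shift == 18): bmap gets one bit per
 * target page, i.e. 4 GiB / 4 KiB = 1 Mi bits (128 KiB of bitmap), while
 * each clear_bmap bit covers 2^18 target pages (1 GiB of guest RAM), so
 * clear_bmap needs only 4 bits.  Roughly speaking, a larger shift defers
 * dirty-log clearing in fewer, coarser chunks; a smaller shift clears it
 * at a finer granularity.
 */
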
   2673static void ram_init_bitmaps(RAMState *rs)
   2674{
   2675    /* For memory_global_dirty_log_start below.  */
   2676    qemu_mutex_lock_iothread();
   2677    qemu_mutex_lock_ramlist();
   2678
   2679    WITH_RCU_READ_LOCK_GUARD() {
   2680        ram_list_init_bitmaps();
   2681        /* We don't use dirty log with background snapshots */
   2682        if (!migrate_background_snapshot()) {
   2683            memory_global_dirty_log_start();
   2684            migration_bitmap_sync_precopy(rs);
   2685        }
   2686    }
   2687    qemu_mutex_unlock_ramlist();
   2688    qemu_mutex_unlock_iothread();
   2689}
   2690
   2691static int ram_init_all(RAMState **rsp)
   2692{
   2693    if (ram_state_init(rsp)) {
   2694        return -1;
   2695    }
   2696
   2697    if (xbzrle_init()) {
   2698        ram_state_cleanup(rsp);
   2699        return -1;
   2700    }
   2701
   2702    ram_init_bitmaps(*rsp);
   2703
   2704    return 0;
   2705}
   2706
   2707static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
   2708{
   2709    RAMBlock *block;
   2710    uint64_t pages = 0;
   2711
   2712    /*
   2713     * Postcopy is not using xbzrle/compression, so no need for that.
    2714     * Also, since the source is already halted, we don't need to care
    2715     * about dirty page logging either.
   2716     */
   2717
   2718    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2719        pages += bitmap_count_one(block->bmap,
   2720                                  block->used_length >> TARGET_PAGE_BITS);
   2721    }
   2722
   2723    /* This may not be aligned with current bitmaps. Recalculate. */
   2724    rs->migration_dirty_pages = pages;
   2725
   2726    ram_state_reset(rs);
   2727
   2728    /* Update RAMState cache of output QEMUFile */
   2729    rs->f = out;
   2730
   2731    trace_ram_state_resume_prepare(pages);
   2732}
   2733
   2734/*
   2735 * This function clears bits of the free pages reported by the caller from the
   2736 * migration dirty bitmap. @addr is the host address corresponding to the
    2737 * start of the contiguous guest free pages, and @len is the total size in
    2738 * bytes of those pages.
   2739 */
   2740void qemu_guest_free_page_hint(void *addr, size_t len)
   2741{
   2742    RAMBlock *block;
   2743    ram_addr_t offset;
   2744    size_t used_len, start, npages;
   2745    MigrationState *s = migrate_get_current();
   2746
   2747    /* This function is currently expected to be used during live migration */
   2748    if (!migration_is_setup_or_active(s->state)) {
   2749        return;
   2750    }
   2751
   2752    for (; len > 0; len -= used_len, addr += used_len) {
   2753        block = qemu_ram_block_from_host(addr, false, &offset);
   2754        if (unlikely(!block || offset >= block->used_length)) {
   2755            /*
   2756             * The implementation might not support RAMBlock resize during
   2757             * live migration, but it could happen in theory with future
   2758             * updates. So we add a check here to capture that case.
   2759             */
   2760            error_report_once("%s unexpected error", __func__);
   2761            return;
   2762        }
   2763
   2764        if (len <= block->used_length - offset) {
   2765            used_len = len;
   2766        } else {
   2767            used_len = block->used_length - offset;
   2768        }
   2769
   2770        start = offset >> TARGET_PAGE_BITS;
   2771        npages = used_len >> TARGET_PAGE_BITS;
   2772
   2773        qemu_mutex_lock(&ram_state->bitmap_mutex);
   2774        /*
    2775         * The skipped free pages are equivalent to having been sent from clear_bmap's
   2776         * perspective, so clear the bits from the memory region bitmap which
   2777         * are initially set. Otherwise those skipped pages will be sent in
   2778         * the next round after syncing from the memory region bitmap.
   2779         */
   2780        migration_clear_memory_region_dirty_bitmap_range(ram_state, block,
   2781                                                         start, npages);
   2782        ram_state->migration_dirty_pages -=
   2783                      bitmap_count_one_with_offset(block->bmap, start, npages);
   2784        bitmap_clear(block->bmap, start, npages);
   2785        qemu_mutex_unlock(&ram_state->bitmap_mutex);
   2786    }
   2787}
   2788
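/*
 * Worked example for the hinting logic above (hypothetical addresses,
 * assuming a 4 KiB target page): a hint of addr = block->host + 0x3000,
 * len = 0x5000 that lies entirely inside one RAMBlock gives
 *
 *   offset   = 0x3000            start  = 0x3000 >> 12 = 3
 *   used_len = 0x5000            npages = 0x5000 >> 12 = 5
 *
 * so bits 3..7 are cleared from block->bmap (and the corresponding ranges
 * in the memory region bitmap), and migration_dirty_pages drops by however
 * many of those bits were still set.  A hint spanning two blocks is simply
 * processed in two iterations of the loop.
 */
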
   2789/*
   2790 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
    2791 * a long-running RCU critical section.  When rcu-reclaims in the code
   2792 * start to become numerous it will be necessary to reduce the
   2793 * granularity of these critical sections.
   2794 */
   2795
   2796/**
   2797 * ram_save_setup: Setup RAM for migration
   2798 *
   2799 * Returns zero to indicate success and negative for error
   2800 *
   2801 * @f: QEMUFile where to send the data
   2802 * @opaque: RAMState pointer
   2803 */
   2804static int ram_save_setup(QEMUFile *f, void *opaque)
   2805{
   2806    RAMState **rsp = opaque;
   2807    RAMBlock *block;
   2808
   2809    if (compress_threads_save_setup()) {
   2810        return -1;
   2811    }
   2812
    2813    /* migration has already set up the bitmap, reuse it. */
   2814    if (!migration_in_colo_state()) {
   2815        if (ram_init_all(rsp) != 0) {
   2816            compress_threads_save_cleanup();
   2817            return -1;
   2818        }
   2819    }
   2820    (*rsp)->f = f;
   2821
   2822    WITH_RCU_READ_LOCK_GUARD() {
   2823        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
   2824
   2825        RAMBLOCK_FOREACH_MIGRATABLE(block) {
   2826            qemu_put_byte(f, strlen(block->idstr));
   2827            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
   2828            qemu_put_be64(f, block->used_length);
   2829            if (migrate_postcopy_ram() && block->page_size !=
   2830                                          qemu_host_page_size) {
   2831                qemu_put_be64(f, block->page_size);
   2832            }
   2833            if (migrate_ignore_shared()) {
   2834                qemu_put_be64(f, block->mr->addr);
   2835            }
   2836        }
   2837    }
   2838
   2839    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
   2840    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
   2841
   2842    multifd_send_sync_main(f);
   2843    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
   2844    qemu_fflush(f);
   2845
   2846    return 0;
   2847}
   2848
   2849/**
   2850 * ram_save_iterate: iterative stage for migration
   2851 *
   2852 * Returns zero to indicate success and negative for error
   2853 *
   2854 * @f: QEMUFile where to send the data
   2855 * @opaque: RAMState pointer
   2856 */
   2857static int ram_save_iterate(QEMUFile *f, void *opaque)
   2858{
   2859    RAMState **temp = opaque;
   2860    RAMState *rs = *temp;
   2861    int ret = 0;
   2862    int i;
   2863    int64_t t0;
   2864    int done = 0;
   2865
   2866    if (blk_mig_bulk_active()) {
   2867        /* Avoid transferring ram during bulk phase of block migration as
   2868         * the bulk phase will usually take a long time and transferring
   2869         * ram updates during that time is pointless. */
   2870        goto out;
   2871    }
   2872
   2873    /*
    2874     * We'll hold this lock a little bit long, but it's okay for two reasons.
    2875     * Firstly, the only other thread that may take it is the one that calls
    2876     * qemu_guest_free_page_hint(), which should be rare; secondly, see
    2877     * MAX_WAIT below (if curious, also see commit 4508bd9ed8053ce), which
    2878     * guarantees that we'll at least release it on a regular basis.
   2879     */
   2880    qemu_mutex_lock(&rs->bitmap_mutex);
   2881    WITH_RCU_READ_LOCK_GUARD() {
   2882        if (ram_list.version != rs->last_version) {
   2883            ram_state_reset(rs);
   2884        }
   2885
   2886        /* Read version before ram_list.blocks */
   2887        smp_rmb();
   2888
   2889        ram_control_before_iterate(f, RAM_CONTROL_ROUND);
   2890
   2891        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
   2892        i = 0;
   2893        while ((ret = qemu_file_rate_limit(f)) == 0 ||
   2894                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
   2895            int pages;
   2896
   2897            if (qemu_file_get_error(f)) {
   2898                break;
   2899            }
   2900
   2901            pages = ram_find_and_save_block(rs, false);
    2902            /* no more pages to send */
   2903            if (pages == 0) {
   2904                done = 1;
   2905                break;
   2906            }
   2907
   2908            if (pages < 0) {
   2909                qemu_file_set_error(f, pages);
   2910                break;
   2911            }
   2912
   2913            rs->target_page_count += pages;
   2914
   2915            /*
   2916             * During postcopy, it is necessary to make sure one whole host
   2917             * page is sent in one chunk.
   2918             */
   2919            if (migrate_postcopy_ram()) {
   2920                flush_compressed_data(rs);
   2921            }
   2922
   2923            /*
    2924             * We want to check on the 1st loop iteration, just in case it was
    2925             * the 1st time and we had to sync the dirty bitmap.
    2926             * qemu_clock_get_ns() is a bit expensive, so we only check every
    2927             * few iterations.
   2928             */
   2929            if ((i & 63) == 0) {
   2930                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
   2931                              1000000;
   2932                if (t1 > MAX_WAIT) {
   2933                    trace_ram_save_iterate_big_wait(t1, i);
   2934                    break;
   2935                }
   2936            }
   2937            i++;
   2938        }
   2939    }
   2940    qemu_mutex_unlock(&rs->bitmap_mutex);
   2941
   2942    /*
   2943     * Must occur before EOS (or any QEMUFile operation)
   2944     * because of RDMA protocol.
   2945     */
   2946    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
   2947
   2948out:
   2949    if (ret >= 0
   2950        && migration_is_setup_or_active(migrate_get_current()->state)) {
   2951        multifd_send_sync_main(rs->f);
   2952        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
   2953        qemu_fflush(f);
   2954        ram_counters.transferred += 8;
   2955
   2956        ret = qemu_file_get_error(f);
   2957    }
   2958    if (ret < 0) {
   2959        return ret;
   2960    }
   2961
   2962    return done;
   2963}
   2964
   2965/**
   2966 * ram_save_complete: function called to send the remaining amount of ram
   2967 *
   2968 * Returns zero to indicate success or negative on error
   2969 *
   2970 * Called with iothread lock
   2971 *
   2972 * @f: QEMUFile where to send the data
   2973 * @opaque: RAMState pointer
   2974 */
   2975static int ram_save_complete(QEMUFile *f, void *opaque)
   2976{
   2977    RAMState **temp = opaque;
   2978    RAMState *rs = *temp;
   2979    int ret = 0;
   2980
   2981    WITH_RCU_READ_LOCK_GUARD() {
   2982        if (!migration_in_postcopy()) {
   2983            migration_bitmap_sync_precopy(rs);
   2984        }
   2985
   2986        ram_control_before_iterate(f, RAM_CONTROL_FINISH);
   2987
   2988        /* try transferring iterative blocks of memory */
   2989
   2990        /* flush all remaining blocks regardless of rate limiting */
   2991        while (true) {
   2992            int pages;
   2993
   2994            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
    2995            /* no more blocks to send */
   2996            if (pages == 0) {
   2997                break;
   2998            }
   2999            if (pages < 0) {
   3000                ret = pages;
   3001                break;
   3002            }
   3003        }
   3004
   3005        flush_compressed_data(rs);
   3006        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
   3007    }
   3008
   3009    if (ret >= 0) {
   3010        multifd_send_sync_main(rs->f);
   3011        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
   3012        qemu_fflush(f);
   3013    }
   3014
   3015    return ret;
   3016}
   3017
   3018static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
   3019                             uint64_t *res_precopy_only,
   3020                             uint64_t *res_compatible,
   3021                             uint64_t *res_postcopy_only)
   3022{
   3023    RAMState **temp = opaque;
   3024    RAMState *rs = *temp;
   3025    uint64_t remaining_size;
   3026
   3027    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
   3028
   3029    if (!migration_in_postcopy() &&
   3030        remaining_size < max_size) {
   3031        qemu_mutex_lock_iothread();
   3032        WITH_RCU_READ_LOCK_GUARD() {
   3033            migration_bitmap_sync_precopy(rs);
   3034        }
   3035        qemu_mutex_unlock_iothread();
   3036        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
   3037    }
   3038
   3039    if (migrate_postcopy_ram()) {
   3040        /* We can do postcopy, and all the data is postcopiable */
   3041        *res_compatible += remaining_size;
   3042    } else {
   3043        *res_precopy_only += remaining_size;
   3044    }
   3045}
   3046
   3047static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
   3048{
   3049    unsigned int xh_len;
   3050    int xh_flags;
   3051    uint8_t *loaded_data;
   3052
   3053    /* extract RLE header */
   3054    xh_flags = qemu_get_byte(f);
   3055    xh_len = qemu_get_be16(f);
   3056
   3057    if (xh_flags != ENCODING_FLAG_XBZRLE) {
   3058        error_report("Failed to load XBZRLE page - wrong compression!");
   3059        return -1;
   3060    }
   3061
   3062    if (xh_len > TARGET_PAGE_SIZE) {
   3063        error_report("Failed to load XBZRLE page - len overflow!");
   3064        return -1;
   3065    }
   3066    loaded_data = XBZRLE.decoded_buf;
   3067    /* load data and decode */
   3068    /* it can change loaded_data to point to an internal buffer */
   3069    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
   3070
   3071    /* decode RLE */
   3072    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
   3073                             TARGET_PAGE_SIZE) == -1) {
   3074        error_report("Failed to load XBZRLE page - decode error!");
   3075        return -1;
   3076    }
   3077
   3078    return 0;
   3079}
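
        /*
         * For reference, the per-page XBZRLE record consumed above is,
         * roughly:
         *
         *   byte   xh_flags   (must be ENCODING_FLAG_XBZRLE)
         *   be16   xh_len     (encoded length, at most TARGET_PAGE_SIZE)
         *   bytes  xh_len bytes of XBZRLE-encoded data
         *
         * The decoded result is applied on top of the current contents of
         * @host, since XBZRLE only carries the delta against the previously
         * sent version of the page.
         */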
   3080
   3081/**
   3082 * ram_block_from_stream: read a RAMBlock id from the migration stream
   3083 *
    3084 * Must be called from within an RCU critical section.
   3085 *
   3086 * Returns a pointer from within the RCU-protected ram_list.
   3087 *
   3088 * @f: QEMUFile where to read the data from
   3089 * @flags: Page flags (mostly to see if it's a continuation of previous block)
   3090 */
   3091static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
   3092{
   3093    static RAMBlock *block;
   3094    char id[256];
   3095    uint8_t len;
   3096
   3097    if (flags & RAM_SAVE_FLAG_CONTINUE) {
   3098        if (!block) {
   3099            error_report("Ack, bad migration stream!");
   3100            return NULL;
   3101        }
   3102        return block;
   3103    }
   3104
   3105    len = qemu_get_byte(f);
   3106    qemu_get_buffer(f, (uint8_t *)id, len);
   3107    id[len] = 0;
   3108
   3109    block = qemu_ram_block_by_name(id);
   3110    if (!block) {
   3111        error_report("Can't find block %s", id);
   3112        return NULL;
   3113    }
   3114
   3115    if (ramblock_is_ignored(block)) {
   3116        error_report("block %s should not be migrated !", id);
   3117        return NULL;
   3118    }
   3119
   3120    return block;
   3121}
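
        /*
         * The block reference parsed above is encoded in one of two ways:
         * either the record's flags carry RAM_SAVE_FLAG_CONTINUE and the
         * block from the previous record is reused, or the record starts
         * with:
         *
         *   byte   len
         *   bytes  len bytes of the block idstr (not NUL-terminated on the
         *          wire, hence the id[len] = 0 above)
         *
         * which is why the last resolved block is kept in a static variable
         * between calls.
         */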
   3122
   3123static inline void *host_from_ram_block_offset(RAMBlock *block,
   3124                                               ram_addr_t offset)
   3125{
   3126    if (!offset_in_ramblock(block, offset)) {
   3127        return NULL;
   3128    }
   3129
   3130    return block->host + offset;
   3131}
   3132
   3133static void *host_page_from_ram_block_offset(RAMBlock *block,
   3134                                             ram_addr_t offset)
   3135{
   3136    /* Note: Explicitly no check against offset_in_ramblock(). */
   3137    return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
   3138                                   block->page_size);
   3139}
   3140
   3141static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
   3142                                                         ram_addr_t offset)
   3143{
   3144    return ((uintptr_t)block->host + offset) & (block->page_size - 1);
   3145}
   3146
   3147static inline void *colo_cache_from_block_offset(RAMBlock *block,
   3148                             ram_addr_t offset, bool record_bitmap)
   3149{
   3150    if (!offset_in_ramblock(block, offset)) {
   3151        return NULL;
   3152    }
   3153    if (!block->colo_cache) {
    3154        error_report("%s: colo_cache is NULL in block: %s",
   3155                     __func__, block->idstr);
   3156        return NULL;
   3157    }
   3158
   3159    /*
    3160     * During a COLO checkpoint, we need a bitmap of these migrated pages.
    3161     * It helps us decide which pages in the RAM cache should be flushed
    3162     * into the VM's RAM later.
   3163    */
   3164    if (record_bitmap &&
   3165        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
   3166        ram_state->migration_dirty_pages++;
   3167    }
   3168    return block->colo_cache + offset;
   3169}
   3170
   3171/**
   3172 * ram_handle_compressed: handle the zero page case
   3173 *
   3174 * If a page (or a whole RDMA chunk) has been
   3175 * determined to be zero, then zap it.
   3176 *
   3177 * @host: host address for the zero page
   3178 * @ch: what the page is filled from.  We only support zero
   3179 * @size: size of the zero page
   3180 */
   3181void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
   3182{
   3183    if (ch != 0 || !is_zero_range(host, size)) {
   3184        memset(host, ch, size);
   3185    }
   3186}
   3187
   3188/* return the size after decompression, or negative value on error */
   3189static int
   3190qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
   3191                     const uint8_t *source, size_t source_len)
   3192{
   3193    int err;
   3194
   3195    err = inflateReset(stream);
   3196    if (err != Z_OK) {
   3197        return -1;
   3198    }
   3199
   3200    stream->avail_in = source_len;
   3201    stream->next_in = (uint8_t *)source;
   3202    stream->avail_out = dest_len;
   3203    stream->next_out = dest;
   3204
   3205    err = inflate(stream, Z_NO_FLUSH);
   3206    if (err != Z_STREAM_END) {
   3207        return -1;
   3208    }
   3209
   3210    return stream->total_out;
   3211}
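
        /*
         * A minimal usage sketch, mirroring how do_data_decompress() below
         * calls this helper (names as used there, no new API assumed):
         *
         *   ret = qemu_uncompress_data(&param->stream, des, TARGET_PAGE_SIZE,
         *                              param->compbuf, len);
         *   if (ret < 0) {
         *       // either inflateReset() failed or inflate() did not reach
         *       // Z_STREAM_END, i.e. the input was not one complete deflate
         *       // stream describing a single page
         *   }
         *
         * The z_stream is expected to have been initialized once with
         * inflateInit(); the inflateReset() above then makes the same stream
         * reusable for every page.
         */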
   3212
   3213static void *do_data_decompress(void *opaque)
   3214{
   3215    DecompressParam *param = opaque;
   3216    unsigned long pagesize;
   3217    uint8_t *des;
   3218    int len, ret;
   3219
   3220    qemu_mutex_lock(&param->mutex);
   3221    while (!param->quit) {
   3222        if (param->des) {
   3223            des = param->des;
   3224            len = param->len;
   3225            param->des = 0;
   3226            qemu_mutex_unlock(&param->mutex);
   3227
   3228            pagesize = TARGET_PAGE_SIZE;
   3229
   3230            ret = qemu_uncompress_data(&param->stream, des, pagesize,
   3231                                       param->compbuf, len);
   3232            if (ret < 0 && migrate_get_current()->decompress_error_check) {
   3233                error_report("decompress data failed");
   3234                qemu_file_set_error(decomp_file, ret);
   3235            }
   3236
   3237            qemu_mutex_lock(&decomp_done_lock);
   3238            param->done = true;
   3239            qemu_cond_signal(&decomp_done_cond);
   3240            qemu_mutex_unlock(&decomp_done_lock);
   3241
   3242            qemu_mutex_lock(&param->mutex);
   3243        } else {
   3244            qemu_cond_wait(&param->cond, &param->mutex);
   3245        }
   3246    }
   3247    qemu_mutex_unlock(&param->mutex);
   3248
   3249    return NULL;
   3250}
   3251
   3252static int wait_for_decompress_done(void)
   3253{
   3254    int idx, thread_count;
   3255
   3256    if (!migrate_use_compression()) {
   3257        return 0;
   3258    }
   3259
   3260    thread_count = migrate_decompress_threads();
   3261    qemu_mutex_lock(&decomp_done_lock);
   3262    for (idx = 0; idx < thread_count; idx++) {
   3263        while (!decomp_param[idx].done) {
   3264            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
   3265        }
   3266    }
   3267    qemu_mutex_unlock(&decomp_done_lock);
   3268    return qemu_file_get_error(decomp_file);
   3269}
   3270
   3271static void compress_threads_load_cleanup(void)
   3272{
   3273    int i, thread_count;
   3274
   3275    if (!migrate_use_compression()) {
   3276        return;
   3277    }
   3278    thread_count = migrate_decompress_threads();
   3279    for (i = 0; i < thread_count; i++) {
   3280        /*
    3281         * we use it as an indicator of whether the thread has been
    3282         * properly initialized or not
   3283         */
   3284        if (!decomp_param[i].compbuf) {
   3285            break;
   3286        }
   3287
   3288        qemu_mutex_lock(&decomp_param[i].mutex);
   3289        decomp_param[i].quit = true;
   3290        qemu_cond_signal(&decomp_param[i].cond);
   3291        qemu_mutex_unlock(&decomp_param[i].mutex);
   3292    }
   3293    for (i = 0; i < thread_count; i++) {
   3294        if (!decomp_param[i].compbuf) {
   3295            break;
   3296        }
   3297
   3298        qemu_thread_join(decompress_threads + i);
   3299        qemu_mutex_destroy(&decomp_param[i].mutex);
   3300        qemu_cond_destroy(&decomp_param[i].cond);
   3301        inflateEnd(&decomp_param[i].stream);
   3302        g_free(decomp_param[i].compbuf);
   3303        decomp_param[i].compbuf = NULL;
   3304    }
   3305    g_free(decompress_threads);
   3306    g_free(decomp_param);
   3307    decompress_threads = NULL;
   3308    decomp_param = NULL;
   3309    decomp_file = NULL;
   3310}
   3311
   3312static int compress_threads_load_setup(QEMUFile *f)
   3313{
   3314    int i, thread_count;
   3315
   3316    if (!migrate_use_compression()) {
   3317        return 0;
   3318    }
   3319
   3320    thread_count = migrate_decompress_threads();
   3321    decompress_threads = g_new0(QemuThread, thread_count);
   3322    decomp_param = g_new0(DecompressParam, thread_count);
   3323    qemu_mutex_init(&decomp_done_lock);
   3324    qemu_cond_init(&decomp_done_cond);
   3325    decomp_file = f;
   3326    for (i = 0; i < thread_count; i++) {
   3327        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
   3328            goto exit;
   3329        }
   3330
   3331        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
   3332        qemu_mutex_init(&decomp_param[i].mutex);
   3333        qemu_cond_init(&decomp_param[i].cond);
   3334        decomp_param[i].done = true;
   3335        decomp_param[i].quit = false;
   3336        qemu_thread_create(decompress_threads + i, "decompress",
   3337                           do_data_decompress, decomp_param + i,
   3338                           QEMU_THREAD_JOINABLE);
   3339    }
   3340    return 0;
   3341exit:
   3342    compress_threads_load_cleanup();
   3343    return -1;
   3344}
   3345
   3346static void decompress_data_with_multi_threads(QEMUFile *f,
   3347                                               void *host, int len)
   3348{
   3349    int idx, thread_count;
   3350
   3351    thread_count = migrate_decompress_threads();
   3352    QEMU_LOCK_GUARD(&decomp_done_lock);
   3353    while (true) {
   3354        for (idx = 0; idx < thread_count; idx++) {
   3355            if (decomp_param[idx].done) {
   3356                decomp_param[idx].done = false;
   3357                qemu_mutex_lock(&decomp_param[idx].mutex);
   3358                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
   3359                decomp_param[idx].des = host;
   3360                decomp_param[idx].len = len;
   3361                qemu_cond_signal(&decomp_param[idx].cond);
   3362                qemu_mutex_unlock(&decomp_param[idx].mutex);
   3363                break;
   3364            }
   3365        }
   3366        if (idx < thread_count) {
   3367            break;
   3368        } else {
   3369            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
   3370        }
   3371    }
   3372}
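
        /*
         * Hand-off protocol between the dispatcher above and
         * do_data_decompress(): a worker is idle when decomp_param[idx].done
         * is true (protected by decomp_done_lock).  The dispatcher claims it
         * by clearing done, copies the compressed bytes into compbuf,
         * publishes des/len under the worker's own mutex and signals its
         * condition variable.  The worker clears des, decompresses, then sets
         * done again and signals decomp_done_cond.  If no worker is idle, the
         * dispatcher blocks on decomp_done_cond instead of reading further
         * from the stream.
         */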
   3373
   3374static void colo_init_ram_state(void)
   3375{
   3376    ram_state_init(&ram_state);
   3377}
   3378
   3379/*
    3380 * COLO cache: this is for the secondary VM.  We cache the whole
    3381 * memory of the secondary VM; the global lock must be held to
    3382 * call this helper.
   3383 */
   3384int colo_init_ram_cache(void)
   3385{
   3386    RAMBlock *block;
   3387
   3388    WITH_RCU_READ_LOCK_GUARD() {
   3389        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3390            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
   3391                                                    NULL, false, false);
   3392            if (!block->colo_cache) {
    3393                error_report("%s: Can't alloc memory for COLO cache of block %s, "
   3394                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
   3395                             block->used_length);
   3396                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3397                    if (block->colo_cache) {
   3398                        qemu_anon_ram_free(block->colo_cache, block->used_length);
   3399                        block->colo_cache = NULL;
   3400                    }
   3401                }
   3402                return -errno;
   3403            }
   3404        }
   3405    }
   3406
   3407    /*
    3408     * Record the dirty pages sent by the PVM; we use this dirty bitmap to
    3409     * decide which pages in the cache should be flushed into the SVM's RAM.
    3410     * Here we use the same name 'ram_bitmap' as for migration.
   3411    */
   3412    if (ram_bytes_total()) {
   3413        RAMBlock *block;
   3414
   3415        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3416            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
   3417            block->bmap = bitmap_new(pages);
   3418        }
   3419    }
   3420
   3421    colo_init_ram_state();
   3422    return 0;
   3423}
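
        /*
         * Rough lifecycle of the COLO cache on the secondary VM, as
         * implemented by the helpers in this file: colo_init_ram_cache()
         * allocates a colo_cache copy and a dirty bitmap per RAMBlock,
         * colo_incoming_start_dirty_log() arms dirty logging, incoming pages
         * are staged through colo_cache_from_block_offset(), a checkpoint
         * flushes only the dirtied pages into the SVM's RAM via
         * colo_flush_ram_cache(), and colo_release_ram_cache() frees
         * everything once COLO ends.
         */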
   3424
   3425/* TODO: duplicated with ram_init_bitmaps */
   3426void colo_incoming_start_dirty_log(void)
   3427{
   3428    RAMBlock *block = NULL;
   3429    /* For memory_global_dirty_log_start below. */
   3430    qemu_mutex_lock_iothread();
   3431    qemu_mutex_lock_ramlist();
   3432
   3433    memory_global_dirty_log_sync();
   3434    WITH_RCU_READ_LOCK_GUARD() {
   3435        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3436            ramblock_sync_dirty_bitmap(ram_state, block);
   3437            /* Discard this dirty bitmap record */
   3438            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
   3439        }
   3440        memory_global_dirty_log_start();
   3441    }
   3442    ram_state->migration_dirty_pages = 0;
   3443    qemu_mutex_unlock_ramlist();
   3444    qemu_mutex_unlock_iothread();
   3445}
   3446
    3447/* The global lock must be held to call this helper */
   3448void colo_release_ram_cache(void)
   3449{
   3450    RAMBlock *block;
   3451
   3452    memory_global_dirty_log_stop();
   3453    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3454        g_free(block->bmap);
   3455        block->bmap = NULL;
   3456    }
   3457
   3458    WITH_RCU_READ_LOCK_GUARD() {
   3459        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3460            if (block->colo_cache) {
   3461                qemu_anon_ram_free(block->colo_cache, block->used_length);
   3462                block->colo_cache = NULL;
   3463            }
   3464        }
   3465    }
   3466    ram_state_cleanup(&ram_state);
   3467}
   3468
   3469/**
   3470 * ram_load_setup: Setup RAM for migration incoming side
   3471 *
   3472 * Returns zero to indicate success and negative for error
   3473 *
   3474 * @f: QEMUFile where to receive the data
   3475 * @opaque: RAMState pointer
   3476 */
   3477static int ram_load_setup(QEMUFile *f, void *opaque)
   3478{
   3479    if (compress_threads_load_setup(f)) {
   3480        return -1;
   3481    }
   3482
   3483    xbzrle_load_setup();
   3484    ramblock_recv_map_init();
   3485
   3486    return 0;
   3487}
   3488
   3489static int ram_load_cleanup(void *opaque)
   3490{
   3491    RAMBlock *rb;
   3492
   3493    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
   3494        qemu_ram_block_writeback(rb);
   3495    }
   3496
   3497    xbzrle_load_cleanup();
   3498    compress_threads_load_cleanup();
   3499
   3500    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
   3501        g_free(rb->receivedmap);
   3502        rb->receivedmap = NULL;
   3503    }
   3504
   3505    return 0;
   3506}
   3507
   3508/**
   3509 * ram_postcopy_incoming_init: allocate postcopy data structures
   3510 *
    3511 * Returns 0 for success and negative if there was an error
   3512 *
   3513 * @mis: current migration incoming state
   3514 *
   3515 * Allocate data structures etc needed by incoming migration with
    3516 * postcopy-ram. postcopy-ram's similarly named
   3517 * postcopy_ram_incoming_init does the work.
   3518 */
   3519int ram_postcopy_incoming_init(MigrationIncomingState *mis)
   3520{
   3521    return postcopy_ram_incoming_init(mis);
   3522}
   3523
   3524/**
   3525 * ram_load_postcopy: load a page in postcopy case
   3526 *
   3527 * Returns 0 for success or -errno in case of error
   3528 *
   3529 * Called in postcopy mode by ram_load().
   3530 * rcu_read_lock is taken prior to this being called.
   3531 *
    3532 * @f: QEMUFile to read the data from
   3533 */
   3534static int ram_load_postcopy(QEMUFile *f)
   3535{
   3536    int flags = 0, ret = 0;
   3537    bool place_needed = false;
   3538    bool matches_target_page_size = false;
   3539    MigrationIncomingState *mis = migration_incoming_get_current();
   3540    /* Temporary page that is later 'placed' */
   3541    void *postcopy_host_page = mis->postcopy_tmp_page;
   3542    void *host_page = NULL;
   3543    bool all_zero = true;
   3544    int target_pages = 0;
   3545
   3546    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
   3547        ram_addr_t addr;
   3548        void *page_buffer = NULL;
   3549        void *place_source = NULL;
   3550        RAMBlock *block = NULL;
   3551        uint8_t ch;
   3552        int len;
   3553
   3554        addr = qemu_get_be64(f);
   3555
   3556        /*
    3557         * If there is a qemu file error, we should stop here, as "addr"
    3558         * may be invalid
   3559         */
   3560        ret = qemu_file_get_error(f);
   3561        if (ret) {
   3562            break;
   3563        }
   3564
   3565        flags = addr & ~TARGET_PAGE_MASK;
   3566        addr &= TARGET_PAGE_MASK;
   3567
   3568        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
   3569        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
   3570                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
   3571            block = ram_block_from_stream(f, flags);
   3572            if (!block) {
   3573                ret = -EINVAL;
   3574                break;
   3575            }
   3576
   3577            /*
   3578             * Relying on used_length is racy and can result in false positives.
   3579             * We might place pages beyond used_length in case RAM was shrunk
   3580             * while in postcopy, which is fine - trying to place via
   3581             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
   3582             */
   3583            if (!block->host || addr >= block->postcopy_length) {
   3584                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
   3585                ret = -EINVAL;
   3586                break;
   3587            }
   3588            target_pages++;
   3589            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
   3590            /*
   3591             * Postcopy requires that we place whole host pages atomically;
   3592             * these may be huge pages for RAMBlocks that are backed by
   3593             * hugetlbfs.
   3594             * To make it atomic, the data is read into a temporary page
   3595             * that's moved into place later.
    3596             * The migration protocol uses (possibly smaller) target pages;
    3597             * however, the source ensures it always sends all the components
   3598             * of a host page in one chunk.
   3599             */
   3600            page_buffer = postcopy_host_page +
   3601                          host_page_offset_from_ram_block_offset(block, addr);
    3602            /* If all target pages are zero then we can optimise the place */
   3603            if (target_pages == 1) {
   3604                host_page = host_page_from_ram_block_offset(block, addr);
   3605            } else if (host_page != host_page_from_ram_block_offset(block,
   3606                                                                    addr)) {
    3607                /* not the first target page within the host page */
   3608                error_report("Non-same host page %p/%p", host_page,
   3609                             host_page_from_ram_block_offset(block, addr));
   3610                ret = -EINVAL;
   3611                break;
   3612            }
   3613
   3614            /*
   3615             * If it's the last part of a host page then we place the host
   3616             * page
   3617             */
   3618            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
   3619                place_needed = true;
   3620            }
   3621            place_source = postcopy_host_page;
   3622        }
   3623
   3624        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
   3625        case RAM_SAVE_FLAG_ZERO:
   3626            ch = qemu_get_byte(f);
   3627            /*
    3628             * We can skip setting page_buffer when
   3629             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
   3630             */
   3631            if (ch || !matches_target_page_size) {
   3632                memset(page_buffer, ch, TARGET_PAGE_SIZE);
   3633            }
   3634            if (ch) {
   3635                all_zero = false;
   3636            }
   3637            break;
   3638
   3639        case RAM_SAVE_FLAG_PAGE:
   3640            all_zero = false;
   3641            if (!matches_target_page_size) {
    3642                /* For huge pages, we always use a temporary buffer */
   3643                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
   3644            } else {
   3645                /*
    3646                 * For small pages that match the target page size, we
   3647                 * avoid the qemu_file copy.  Instead we directly use
   3648                 * the buffer of QEMUFile to place the page.  Note: we
   3649                 * cannot do any QEMUFile operation before using that
   3650                 * buffer to make sure the buffer is valid when
   3651                 * placing the page.
   3652                 */
   3653                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
   3654                                         TARGET_PAGE_SIZE);
   3655            }
   3656            break;
   3657        case RAM_SAVE_FLAG_COMPRESS_PAGE:
   3658            all_zero = false;
   3659            len = qemu_get_be32(f);
   3660            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
   3661                error_report("Invalid compressed data length: %d", len);
   3662                ret = -EINVAL;
   3663                break;
   3664            }
   3665            decompress_data_with_multi_threads(f, page_buffer, len);
   3666            break;
   3667
   3668        case RAM_SAVE_FLAG_EOS:
   3669            /* normal exit */
   3670            multifd_recv_sync_main();
   3671            break;
   3672        default:
   3673            error_report("Unknown combination of migration flags: 0x%x"
   3674                         " (postcopy mode)", flags);
   3675            ret = -EINVAL;
   3676            break;
   3677        }
   3678
   3679        /* Got the whole host page, wait for decompress before placing. */
   3680        if (place_needed) {
   3681            ret |= wait_for_decompress_done();
   3682        }
   3683
    3684        /* Check for any possible file errors */
   3685        if (!ret && qemu_file_get_error(f)) {
   3686            ret = qemu_file_get_error(f);
   3687        }
   3688
   3689        if (!ret && place_needed) {
   3690            if (all_zero) {
   3691                ret = postcopy_place_page_zero(mis, host_page, block);
   3692            } else {
   3693                ret = postcopy_place_page(mis, host_page, place_source,
   3694                                          block);
   3695            }
   3696            place_needed = false;
   3697            target_pages = 0;
   3698            /* Assume we have a zero page until we detect something different */
   3699            all_zero = true;
   3700        }
   3701    }
   3702
   3703    return ret;
   3704}
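
        /*
         * A worked example of the host-page accumulation above, assuming a
         * hugetlbfs-backed RAMBlock with 2 MiB host pages and 4 KiB target
         * pages: each host page spans block->page_size / TARGET_PAGE_SIZE =
         * 512 target pages, so 512 consecutive records are staged into
         * postcopy_tmp_page (at the offsets computed by
         * host_page_offset_from_ram_block_offset()) before a single atomic
         * postcopy_place_page() or postcopy_place_page_zero() makes the whole
         * host page visible to the guest.
         */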
   3705
   3706static bool postcopy_is_advised(void)
   3707{
   3708    PostcopyState ps = postcopy_state_get();
   3709    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
   3710}
   3711
   3712static bool postcopy_is_running(void)
   3713{
   3714    PostcopyState ps = postcopy_state_get();
   3715    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
   3716}
   3717
   3718/*
    3719 * Flush the content of the RAM cache into the SVM's memory.
    3720 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
   3721 */
   3722void colo_flush_ram_cache(void)
   3723{
   3724    RAMBlock *block = NULL;
   3725    void *dst_host;
   3726    void *src_host;
   3727    unsigned long offset = 0;
   3728
   3729    memory_global_dirty_log_sync();
   3730    qemu_mutex_lock(&ram_state->bitmap_mutex);
   3731    WITH_RCU_READ_LOCK_GUARD() {
   3732        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3733            ramblock_sync_dirty_bitmap(ram_state, block);
   3734        }
   3735    }
   3736
   3737    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
   3738    WITH_RCU_READ_LOCK_GUARD() {
   3739        block = QLIST_FIRST_RCU(&ram_list.blocks);
   3740
   3741        while (block) {
   3742            offset = migration_bitmap_find_dirty(ram_state, block, offset);
   3743
   3744            if (!offset_in_ramblock(block,
   3745                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
   3746                offset = 0;
   3747                block = QLIST_NEXT_RCU(block, next);
   3748            } else {
   3749                migration_bitmap_clear_dirty(ram_state, block, offset);
   3750                dst_host = block->host
   3751                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
   3752                src_host = block->colo_cache
   3753                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
   3754                memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
   3755            }
   3756        }
   3757    }
   3758    trace_colo_flush_ram_cache_end();
   3759    qemu_mutex_unlock(&ram_state->bitmap_mutex);
   3760}
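
        /*
         * The flush above walks each RAMBlock's dirty bitmap with
         * migration_bitmap_find_dirty(); when the returned offset falls
         * outside the block it moves on to the next block, otherwise it
         * clears the dirty bit and copies exactly one TARGET_PAGE_SIZE page
         * from colo_cache back into the block's host memory, so only pages
         * dirtied since the last checkpoint are touched.
         */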
   3761
   3762/**
   3763 * ram_load_precopy: load pages in precopy case
   3764 *
   3765 * Returns 0 for success or -errno in case of error
   3766 *
   3767 * Called in precopy mode by ram_load().
   3768 * rcu_read_lock is taken prior to this being called.
   3769 *
    3770 * @f: QEMUFile to read the data from
   3771 */
   3772static int ram_load_precopy(QEMUFile *f)
   3773{
   3774    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    3775    /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
   3776    bool postcopy_advised = postcopy_is_advised();
   3777    if (!migrate_use_compression()) {
   3778        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
   3779    }
   3780
   3781    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
   3782        ram_addr_t addr, total_ram_bytes;
   3783        void *host = NULL, *host_bak = NULL;
   3784        uint8_t ch;
   3785
   3786        /*
    3787         * Yield periodically to let the main loop run, but an iteration of
    3788         * the main loop is expensive, so only do it every so many iterations
   3789         */
   3790        if ((i & 32767) == 0 && qemu_in_coroutine()) {
   3791            aio_co_schedule(qemu_get_current_aio_context(),
   3792                            qemu_coroutine_self());
   3793            qemu_coroutine_yield();
   3794        }
   3795        i++;
   3796
   3797        addr = qemu_get_be64(f);
   3798        flags = addr & ~TARGET_PAGE_MASK;
   3799        addr &= TARGET_PAGE_MASK;
   3800
   3801        if (flags & invalid_flags) {
   3802            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
   3803                error_report("Received an unexpected compressed page");
   3804            }
   3805
   3806            ret = -EINVAL;
   3807            break;
   3808        }
   3809
   3810        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
   3811                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
   3812            RAMBlock *block = ram_block_from_stream(f, flags);
   3813
   3814            host = host_from_ram_block_offset(block, addr);
   3815            /*
    3816             * After going into the COLO stage, we should not load the page
    3817             * into the SVM's memory directly; we put it into colo_cache first.
    3818             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
    3819             * Previously, we copied all of this memory in the COLO preparation
    3820             * stage, during which we had to stop the VM - a time-consuming
    3821             * process.  Here we optimize it with a trick: back up every page
    3822             * during migration while COLO is enabled.  Although this affects
    3823             * the speed of the migration, it clearly reduces the downtime of
    3824             * backing up all of the SVM's memory in the COLO preparation stage.
   3825             */
   3826            if (migration_incoming_colo_enabled()) {
   3827                if (migration_incoming_in_colo_state()) {
   3828                    /* In COLO stage, put all pages into cache temporarily */
   3829                    host = colo_cache_from_block_offset(block, addr, true);
   3830                } else {
    3831                    /*
    3832                     * In the migration stage but before the COLO stage,
    3833                     * put all pages into both the cache and the SVM's memory.
    3834                     */
   3835                    host_bak = colo_cache_from_block_offset(block, addr, false);
   3836                }
   3837            }
   3838            if (!host) {
   3839                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
   3840                ret = -EINVAL;
   3841                break;
   3842            }
   3843            if (!migration_incoming_in_colo_state()) {
   3844                ramblock_recv_bitmap_set(block, host);
   3845            }
   3846
   3847            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
   3848        }
   3849
   3850        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
   3851        case RAM_SAVE_FLAG_MEM_SIZE:
   3852            /* Synchronize RAM block list */
   3853            total_ram_bytes = addr;
   3854            while (!ret && total_ram_bytes) {
   3855                RAMBlock *block;
   3856                char id[256];
   3857                ram_addr_t length;
   3858
   3859                len = qemu_get_byte(f);
   3860                qemu_get_buffer(f, (uint8_t *)id, len);
   3861                id[len] = 0;
   3862                length = qemu_get_be64(f);
   3863
   3864                block = qemu_ram_block_by_name(id);
   3865                if (block && !qemu_ram_is_migratable(block)) {
   3866                    error_report("block %s should not be migrated !", id);
   3867                    ret = -EINVAL;
   3868                } else if (block) {
   3869                    if (length != block->used_length) {
   3870                        Error *local_err = NULL;
   3871
   3872                        ret = qemu_ram_resize(block, length,
   3873                                              &local_err);
   3874                        if (local_err) {
   3875                            error_report_err(local_err);
   3876                        }
   3877                    }
   3878                    /* For postcopy we need to check hugepage sizes match */
   3879                    if (postcopy_advised && migrate_postcopy_ram() &&
   3880                        block->page_size != qemu_host_page_size) {
   3881                        uint64_t remote_page_size = qemu_get_be64(f);
   3882                        if (remote_page_size != block->page_size) {
   3883                            error_report("Mismatched RAM page size %s "
   3884                                         "(local) %zd != %" PRId64,
   3885                                         id, block->page_size,
   3886                                         remote_page_size);
   3887                            ret = -EINVAL;
   3888                        }
   3889                    }
   3890                    if (migrate_ignore_shared()) {
   3891                        hwaddr addr = qemu_get_be64(f);
   3892                        if (ramblock_is_ignored(block) &&
   3893                            block->mr->addr != addr) {
   3894                            error_report("Mismatched GPAs for block %s "
    3895                                         "%" PRId64 " != %" PRId64,
   3896                                         id, (uint64_t)addr,
   3897                                         (uint64_t)block->mr->addr);
   3898                            ret = -EINVAL;
   3899                        }
   3900                    }
   3901                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
   3902                                          block->idstr);
   3903                } else {
   3904                    error_report("Unknown ramblock \"%s\", cannot "
   3905                                 "accept migration", id);
   3906                    ret = -EINVAL;
   3907                }
   3908
   3909                total_ram_bytes -= length;
   3910            }
   3911            break;
   3912
   3913        case RAM_SAVE_FLAG_ZERO:
   3914            ch = qemu_get_byte(f);
   3915            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
   3916            break;
   3917
   3918        case RAM_SAVE_FLAG_PAGE:
   3919            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
   3920            break;
   3921
   3922        case RAM_SAVE_FLAG_COMPRESS_PAGE:
   3923            len = qemu_get_be32(f);
   3924            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
   3925                error_report("Invalid compressed data length: %d", len);
   3926                ret = -EINVAL;
   3927                break;
   3928            }
   3929            decompress_data_with_multi_threads(f, host, len);
   3930            break;
   3931
   3932        case RAM_SAVE_FLAG_XBZRLE:
   3933            if (load_xbzrle(f, addr, host) < 0) {
   3934                error_report("Failed to decompress XBZRLE page at "
   3935                             RAM_ADDR_FMT, addr);
   3936                ret = -EINVAL;
   3937                break;
   3938            }
   3939            break;
   3940        case RAM_SAVE_FLAG_EOS:
   3941            /* normal exit */
   3942            multifd_recv_sync_main();
   3943            break;
   3944        default:
   3945            if (flags & RAM_SAVE_FLAG_HOOK) {
   3946                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
   3947            } else {
   3948                error_report("Unknown combination of migration flags: 0x%x",
   3949                             flags);
   3950                ret = -EINVAL;
   3951            }
   3952        }
   3953        if (!ret) {
   3954            ret = qemu_file_get_error(f);
   3955        }
   3956        if (!ret && host_bak) {
   3957            memcpy(host_bak, host, TARGET_PAGE_SIZE);
   3958        }
   3959    }
   3960
   3961    ret |= wait_for_decompress_done();
   3962    return ret;
   3963}
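
        /*
         * Each record read above starts with a be64 whose bits below
         * TARGET_PAGE_MASK carry the RAM_SAVE_FLAG_* values and whose upper
         * bits carry the page offset within the block.  For example, a plain
         * page at offset 0x2000 arrives roughly as:
         *
         *   be64   0x2000 | RAM_SAVE_FLAG_PAGE [| RAM_SAVE_FLAG_CONTINUE]
         *   ...    block id unless CONTINUE is set, then TARGET_PAGE_SIZE
         *          bytes of page data
         *
         * and the stream is terminated by a record carrying
         * RAM_SAVE_FLAG_EOS.
         */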
   3964
   3965static int ram_load(QEMUFile *f, void *opaque, int version_id)
   3966{
   3967    int ret = 0;
   3968    static uint64_t seq_iter;
   3969    /*
    3970     * If the system is running in postcopy mode, page inserts to host
    3971     * memory must be atomic
   3972     */
   3973    bool postcopy_running = postcopy_is_running();
   3974
   3975    seq_iter++;
   3976
   3977    if (version_id != 4) {
   3978        return -EINVAL;
   3979    }
   3980
   3981    /*
   3982     * This RCU critical section can be very long running.
   3983     * When RCU reclaims in the code start to become numerous,
   3984     * it will be necessary to reduce the granularity of this
   3985     * critical section.
   3986     */
   3987    WITH_RCU_READ_LOCK_GUARD() {
   3988        if (postcopy_running) {
   3989            ret = ram_load_postcopy(f);
   3990        } else {
   3991            ret = ram_load_precopy(f);
   3992        }
   3993    }
   3994    trace_ram_load_complete(ret, seq_iter);
   3995
   3996    return ret;
   3997}
   3998
   3999static bool ram_has_postcopy(void *opaque)
   4000{
   4001    RAMBlock *rb;
   4002    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
   4003        if (ramblock_is_pmem(rb)) {
    4004        info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
   4005                         "is not supported now!", rb->idstr, rb->host);
   4006            return false;
   4007        }
   4008    }
   4009
   4010    return migrate_postcopy_ram();
   4011}
   4012
   4013/* Sync all the dirty bitmap with destination VM.  */
   4014static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
   4015{
   4016    RAMBlock *block;
   4017    QEMUFile *file = s->to_dst_file;
   4018    int ramblock_count = 0;
   4019
   4020    trace_ram_dirty_bitmap_sync_start();
   4021
   4022    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   4023        qemu_savevm_send_recv_bitmap(file, block->idstr);
   4024        trace_ram_dirty_bitmap_request(block->idstr);
   4025        ramblock_count++;
   4026    }
   4027
   4028    trace_ram_dirty_bitmap_sync_wait();
   4029
    4030    /* Wait until all the ramblocks' dirty bitmaps are synced */
   4031    while (ramblock_count--) {
   4032        qemu_sem_wait(&s->rp_state.rp_sem);
   4033    }
   4034
   4035    trace_ram_dirty_bitmap_sync_complete();
   4036
   4037    return 0;
   4038}
   4039
   4040static void ram_dirty_bitmap_reload_notify(MigrationState *s)
   4041{
   4042    qemu_sem_post(&s->rp_state.rp_sem);
   4043}
   4044
   4045/*
    4046 * Read the received bitmap and invert it to form the initial dirty
    4047 * bitmap. This is only used when a postcopy migration is paused but
    4048 * wants to resume from a middle point.
   4049 */
   4050int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
   4051{
   4052    int ret = -EINVAL;
   4053    /* from_dst_file is always valid because we're within rp_thread */
   4054    QEMUFile *file = s->rp_state.from_dst_file;
   4055    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
   4056    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
   4057    uint64_t size, end_mark;
   4058
   4059    trace_ram_dirty_bitmap_reload_begin(block->idstr);
   4060
   4061    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
   4062        error_report("%s: incorrect state %s", __func__,
   4063                     MigrationStatus_str(s->state));
   4064        return -EINVAL;
   4065    }
   4066
   4067    /*
   4068     * Note: see comments in ramblock_recv_bitmap_send() on why we
   4069     * need the endianness conversion, and the paddings.
   4070     */
   4071    local_size = ROUND_UP(local_size, 8);
   4072
    4073    /* Add padding */
   4074    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
   4075
   4076    size = qemu_get_be64(file);
   4077
    4078    /* The size of the bitmap should match our ramblock */
   4079    if (size != local_size) {
   4080        error_report("%s: ramblock '%s' bitmap size mismatch "
   4081                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
   4082                     block->idstr, size, local_size);
   4083        ret = -EINVAL;
   4084        goto out;
   4085    }
   4086
   4087    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
   4088    end_mark = qemu_get_be64(file);
   4089
   4090    ret = qemu_file_get_error(file);
   4091    if (ret || size != local_size) {
   4092        error_report("%s: read bitmap failed for ramblock '%s': %d"
   4093                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
   4094                     __func__, block->idstr, ret, local_size, size);
   4095        ret = -EIO;
   4096        goto out;
   4097    }
   4098
   4099    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
   4100        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
   4101                     __func__, block->idstr, end_mark);
   4102        ret = -EINVAL;
   4103        goto out;
   4104    }
   4105
   4106    /*
   4107     * Endianness conversion. We are during postcopy (though paused).
   4108     * The dirty bitmap won't change. We can directly modify it.
   4109     */
   4110    bitmap_from_le(block->bmap, le_bitmap, nbits);
   4111
   4112    /*
    4113     * What we received is the "received bitmap".  Invert it to form the
    4114     * initial dirty bitmap for this ramblock.
   4115     */
   4116    bitmap_complement(block->bmap, block->bmap, nbits);
   4117
   4118    trace_ram_dirty_bitmap_reload_complete(block->idstr);
   4119
   4120    /*
    4121     * We succeeded in syncing the bitmap for the current ramblock. If
    4122     * this is the last one to sync, we need to notify the main send thread.
   4123     */
   4124    ram_dirty_bitmap_reload_notify(s);
   4125
   4126    ret = 0;
   4127out:
   4128    g_free(le_bitmap);
   4129    return ret;
   4130}
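
        /*
         * For reference, the record consumed above (sent by the destination
         * through ramblock_recv_bitmap_send()) is, roughly:
         *
         *   be64   payload size in bytes (nbits / 8, rounded up to a
         *          multiple of 8 for the padding noted above)
         *   bytes  little-endian bitmap of the pages already received
         *   be64   RAMBLOCK_RECV_BITMAP_ENDING end mark
         *
         * The final bitmap_complement() turns "already received" into "still
         * dirty", which is exactly the state the source needs in order to
         * resume a paused postcopy migration.
         */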
   4131
   4132static int ram_resume_prepare(MigrationState *s, void *opaque)
   4133{
   4134    RAMState *rs = *(RAMState **)opaque;
   4135    int ret;
   4136
   4137    ret = ram_dirty_bitmap_sync_all(s, rs);
   4138    if (ret) {
   4139        return ret;
   4140    }
   4141
   4142    ram_state_resume_prepare(rs, s->to_dst_file);
   4143
   4144    return 0;
   4145}
   4146
   4147static SaveVMHandlers savevm_ram_handlers = {
   4148    .save_setup = ram_save_setup,
   4149    .save_live_iterate = ram_save_iterate,
   4150    .save_live_complete_postcopy = ram_save_complete,
   4151    .save_live_complete_precopy = ram_save_complete,
   4152    .has_postcopy = ram_has_postcopy,
   4153    .save_live_pending = ram_save_pending,
   4154    .load_state = ram_load,
   4155    .save_cleanup = ram_save_cleanup,
   4156    .load_setup = ram_load_setup,
   4157    .load_cleanup = ram_load_cleanup,
   4158    .resume_prepare = ram_resume_prepare,
   4159};
   4160
   4161static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
   4162                                      size_t old_size, size_t new_size)
   4163{
   4164    PostcopyState ps = postcopy_state_get();
   4165    ram_addr_t offset;
   4166    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
   4167    Error *err = NULL;
   4168
   4169    if (ramblock_is_ignored(rb)) {
   4170        return;
   4171    }
   4172
   4173    if (!migration_is_idle()) {
   4174        /*
   4175         * Precopy code on the source cannot deal with the size of RAM blocks
   4176         * changing at random points in time - especially after sending the
   4177         * RAM block sizes in the migration stream, they must no longer change.
   4178         * Abort and indicate a proper reason.
   4179         */
   4180        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
   4181        migrate_set_error(migrate_get_current(), err);
   4182        error_free(err);
   4183        migration_cancel();
   4184    }
   4185
   4186    switch (ps) {
   4187    case POSTCOPY_INCOMING_ADVISE:
   4188        /*
   4189         * Update what ram_postcopy_incoming_init()->init_range() does at the
   4190         * time postcopy was advised. Syncing RAM blocks with the source will
   4191         * result in RAM resizes.
   4192         */
   4193        if (old_size < new_size) {
   4194            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
   4195                error_report("RAM block '%s' discard of resized RAM failed",
   4196                             rb->idstr);
   4197            }
   4198        }
   4199        rb->postcopy_length = new_size;
   4200        break;
   4201    case POSTCOPY_INCOMING_NONE:
   4202    case POSTCOPY_INCOMING_RUNNING:
   4203    case POSTCOPY_INCOMING_END:
   4204        /*
    4205         * Once our guest is running, postcopy no longer cares about
    4206         * resizes. When growing, the new memory was not available on the
    4207         * source, so no handler is needed.
   4208         */
   4209        break;
   4210    default:
   4211        error_report("RAM block '%s' resized during postcopy state: %d",
   4212                     rb->idstr, ps);
   4213        exit(-1);
   4214    }
   4215}
   4216
   4217static RAMBlockNotifier ram_mig_ram_notifier = {
   4218    .ram_block_resized = ram_mig_ram_block_resized,
   4219};
   4220
   4221void ram_mig_init(void)
   4222{
   4223    qemu_mutex_init(&XBZRLE.lock);
   4224    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
   4225    ram_block_notifier_add(&ram_mig_ram_notifier);
   4226}