cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

postcopy-ram.c (46662B)


      1/*
      2 * Postcopy migration for RAM
      3 *
      4 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
      5 *
      6 * Authors:
      7 *  Dave Gilbert  <dgilbert@redhat.com>
      8 *
      9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
     10 * See the COPYING file in the top-level directory.
     11 *
     12 */
     13
     14/*
     15 * Postcopy is a migration technique where the execution flips from the
     16 * source to the destination before all the data has been copied.
     17 */
     18
     19#include "qemu/osdep.h"
     20#include "qemu/rcu.h"
     21#include "exec/target_page.h"
     22#include "migration.h"
     23#include "qemu-file.h"
     24#include "savevm.h"
     25#include "postcopy-ram.h"
     26#include "ram.h"
     27#include "qapi/error.h"
     28#include "qemu/notify.h"
     29#include "qemu/rcu.h"
     30#include "sysemu/sysemu.h"
     31#include "qemu/error-report.h"
     32#include "trace.h"
     33#include "hw/boards.h"
     34#include "exec/ramblock.h"
     35
     36/* Arbitrary limit on size of each discard command,
      37 * which keeps them at around 200 bytes
     38 */
     39#define MAX_DISCARDS_PER_COMMAND 12
     40
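        /* Accumulates discard ranges for one RAMBlock until a command is sent. */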
     41struct PostcopyDiscardState {
     42    const char *ramblock_name;
     43    uint16_t cur_entry;
     44    /*
     45     * Start and length of a discard range (bytes)
     46     */
     47    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
     48    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
     49    unsigned int nsentwords;
     50    unsigned int nsentcmds;
     51};
     52
     53static NotifierWithReturnList postcopy_notifier_list;
     54
     55void postcopy_infrastructure_init(void)
     56{
     57    notifier_with_return_list_init(&postcopy_notifier_list);
     58}
     59
     60void postcopy_add_notifier(NotifierWithReturn *nn)
     61{
     62    notifier_with_return_list_add(&postcopy_notifier_list, nn);
     63}
     64
     65void postcopy_remove_notifier(NotifierWithReturn *n)
     66{
     67    notifier_with_return_remove(n);
     68}
     69
     70int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
     71{
     72    struct PostcopyNotifyData pnd;
     73    pnd.reason = reason;
     74    pnd.errp = errp;
     75
     76    return notifier_with_return_list_notify(&postcopy_notifier_list,
     77                                            &pnd);
     78}
     79
     80/* Postcopy needs to detect accesses to pages that haven't yet been copied
      81 * across, and to map new pages in efficiently; the techniques for doing
      82 * this are target OS specific.
     83 */
     84#if defined(__linux__)
     85
     86#include <poll.h>
     87#include <sys/ioctl.h>
     88#include <sys/syscall.h>
     89#include <asm/types.h> /* for __u64 */
     90#endif
     91
     92#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
     93#include <sys/eventfd.h>
     94#include <linux/userfaultfd.h>
     95
     96typedef struct PostcopyBlocktimeContext {
     97    /* time when page fault initiated per vCPU */
     98    uint32_t *page_fault_vcpu_time;
     99    /* page address per vCPU */
    100    uintptr_t *vcpu_addr;
    101    uint32_t total_blocktime;
    102    /* blocktime per vCPU */
    103    uint32_t *vcpu_blocktime;
    104    /* point in time when last page fault was initiated */
    105    uint32_t last_begin;
     106    /* number of vCPUs currently suspended */
    107    int smp_cpus_down;
    108    uint64_t start_time;
    109
     110    /*
     111     * Handler for the exit event, needed to release
     112     * the whole blocktime_ctx
     113     */
    114    Notifier exit_notifier;
    115} PostcopyBlocktimeContext;
    116
    117static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
    118{
    119    g_free(ctx->page_fault_vcpu_time);
    120    g_free(ctx->vcpu_addr);
    121    g_free(ctx->vcpu_blocktime);
    122    g_free(ctx);
    123}
    124
    125static void migration_exit_cb(Notifier *n, void *data)
    126{
    127    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
    128                                                 exit_notifier);
    129    destroy_blocktime_context(ctx);
    130}
    131
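        /*
         * Allocate the per-vCPU blocktime tracking arrays and register an exit
         * notifier so the context is freed on shutdown.
         */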
    132static struct PostcopyBlocktimeContext *blocktime_context_new(void)
    133{
    134    MachineState *ms = MACHINE(qdev_get_machine());
    135    unsigned int smp_cpus = ms->smp.cpus;
    136    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    137    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    138    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    139    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
    140
    141    ctx->exit_notifier.notify = migration_exit_cb;
    142    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    143    qemu_add_exit_notifier(&ctx->exit_notifier);
    144    return ctx;
    145}
    146
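        /* Build a QAPI uint32List of the per-vCPU blocktimes, in vCPU index order. */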
    147static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
    148{
    149    MachineState *ms = MACHINE(qdev_get_machine());
    150    uint32List *list = NULL;
    151    int i;
    152
    153    for (i = ms->smp.cpus - 1; i >= 0; i--) {
    154        QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
    155    }
    156
    157    return list;
    158}
    159
     160/*
     161 * Populate MigrationInfo from postcopy's blocktime context.
     162 * It does nothing unless the postcopy-blocktime capability
     163 * was set (i.e. a blocktime context exists).
     164 *
     165 * @info: pointer to MigrationInfo to populate
     166 */
    167void fill_destination_postcopy_migration_info(MigrationInfo *info)
    168{
    169    MigrationIncomingState *mis = migration_incoming_get_current();
    170    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
    171
    172    if (!bc) {
    173        return;
    174    }
    175
    176    info->has_postcopy_blocktime = true;
    177    info->postcopy_blocktime = bc->total_blocktime;
    178    info->has_postcopy_vcpu_blocktime = true;
    179    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
    180}
    181
    182static uint32_t get_postcopy_total_blocktime(void)
    183{
    184    MigrationIncomingState *mis = migration_incoming_get_current();
    185    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
    186
    187    if (!bc) {
    188        return 0;
    189    }
    190
    191    return bc->total_blocktime;
    192}
    193
     194/**
     195 * receive_ufd_features: query the userfault fd features so that only
     196 * supported features are requested later.
     197 *
     198 * Returns: true on success
     199 *
     200 * The availability of __NR_userfaultfd must have been checked already.
     201 * @features: out parameter; on success it contains the uffdio_api.features
     202 *            reported by the kernel
     203 */
    204static bool receive_ufd_features(uint64_t *features)
    205{
    206    struct uffdio_api api_struct = {0};
    207    int ufd;
    208    bool ret = true;
    209
     210    /* if we got here, __NR_userfaultfd should exist */
    211    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    212    if (ufd == -1) {
    213        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
    214                     strerror(errno));
    215        return false;
    216    }
    217
     218    /* query the supported features */
    219    api_struct.api = UFFD_API;
    220    api_struct.features = 0;
    221    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
    222        error_report("%s: UFFDIO_API failed: %s", __func__,
    223                     strerror(errno));
    224        ret = false;
    225        goto release_ufd;
    226    }
    227
    228    *features = api_struct.features;
    229
    230release_ufd:
    231    close(ufd);
    232    return ret;
    233}
    234
     235/**
     236 * request_ufd_features: this function must be called only once on a newly
     237 * opened ufd; subsequent calls will fail.
     238 *
     239 * Returns: true on success
     240 *
     241 * @ufd: fd obtained from the userfaultfd syscall
     242 * @features: bit mask, see UFFD_API_FEATURES
     243 */
    244static bool request_ufd_features(int ufd, uint64_t features)
    245{
    246    struct uffdio_api api_struct = {0};
    247    uint64_t ioctl_mask;
    248
    249    api_struct.api = UFFD_API;
    250    api_struct.features = features;
    251    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
    252        error_report("%s failed: UFFDIO_API failed: %s", __func__,
    253                     strerror(errno));
    254        return false;
    255    }
    256
    257    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
    258                 (__u64)1 << _UFFDIO_UNREGISTER;
    259    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
    260        error_report("Missing userfault features: %" PRIx64,
    261                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
    262        return false;
    263    }
    264
    265    return true;
    266}
    267
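        /*
         * Check that the userfault fd supports the features we need and enable
         * the optional ones we want (e.g. thread id reporting for blocktime).
         * Returns true on success.
         */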
    268static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
    269{
    270    uint64_t asked_features = 0;
    271    static uint64_t supported_features;
    272
     273    /*
     274     * It is not possible to request UFFD_API twice on the same fd,
     275     * and the supported userfault fd features are persistent, so
     276     * query them only once.
     277     */
    278    if (!supported_features) {
    279        if (!receive_ufd_features(&supported_features)) {
    280            error_report("%s failed", __func__);
    281            return false;
    282        }
    283    }
    284
    285#ifdef UFFD_FEATURE_THREAD_ID
    286    if (migrate_postcopy_blocktime() && mis &&
    287        UFFD_FEATURE_THREAD_ID & supported_features) {
     288        /* The kernel supports the feature; create the blocktime
     289         * context unless it already exists. */
    290        if (!mis->blocktime_ctx) {
    291            mis->blocktime_ctx = blocktime_context_new();
    292        }
    293
    294        asked_features |= UFFD_FEATURE_THREAD_ID;
    295    }
    296#endif
    297
     298    /*
     299     * Request the features even if asked_features is 0, because the
     300     * kernel expects UFFD_API before UFFDIO_REGISTER on every
     301     * userfault file descriptor.
     302     */
    303    if (!request_ufd_features(ufd, asked_features)) {
    304        error_report("%s failed: features %" PRIu64, __func__,
    305                     asked_features);
    306        return false;
    307    }
    308
    309    if (qemu_real_host_page_size != ram_pagesize_summary()) {
    310        bool have_hp = false;
    311        /* We've got a huge page */
    312#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
    313        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
    314#endif
    315        if (!have_hp) {
    316            error_report("Userfault on this host does not support huge pages");
    317            return false;
    318        }
    319    }
    320    return true;
    321}
    322
    323/* Callback from postcopy_ram_supported_by_host block iterator.
    324 */
    325static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
    326{
    327    const char *block_name = qemu_ram_get_idstr(rb);
    328    ram_addr_t length = qemu_ram_get_used_length(rb);
    329    size_t pagesize = qemu_ram_pagesize(rb);
    330
    331    if (length % pagesize) {
    332        error_report("Postcopy requires RAM blocks to be a page size multiple,"
    333                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
    334                     "page size of 0x%zx", block_name, length, pagesize);
    335        return 1;
    336    }
    337    return 0;
    338}
    339
     340/*
     341 * Note: This has the side effect of munlock'ing all of RAM; that's
     342 * normally fine, since if the postcopy succeeds mlock gets turned back
     343 * on at the end.
     344 */
    345bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
    346{
    347    long pagesize = qemu_real_host_page_size;
    348    int ufd = -1;
    349    bool ret = false; /* Error unless we change it */
    350    void *testarea = NULL;
    351    struct uffdio_register reg_struct;
    352    struct uffdio_range range_struct;
    353    uint64_t feature_mask;
    354    Error *local_err = NULL;
    355
    356    if (qemu_target_page_size() > pagesize) {
    357        error_report("Target page size bigger than host page size");
    358        goto out;
    359    }
    360
    361    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    362    if (ufd == -1) {
    363        error_report("%s: userfaultfd not available: %s", __func__,
    364                     strerror(errno));
    365        goto out;
    366    }
    367
    368    /* Give devices a chance to object */
    369    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
    370        error_report_err(local_err);
    371        goto out;
    372    }
    373
    374    /* Version and features check */
    375    if (!ufd_check_and_apply(ufd, mis)) {
    376        goto out;
    377    }
    378
    379    /* We don't support postcopy with shared RAM yet */
    380    if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
    381        goto out;
    382    }
    383
    384    /*
    385     * userfault and mlock don't go together; we'll put it back later if
    386     * it was enabled.
    387     */
    388    if (munlockall()) {
    389        error_report("%s: munlockall: %s", __func__,  strerror(errno));
    390        goto out;
    391    }
    392
     393    /*
     394     * We need to check that the ops we need are supported on anonymous
     395     * memory. To do that we register a chunk and look at the flags that
     396     * are returned.
     397     */
    398    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
    399                                    MAP_ANONYMOUS, -1, 0);
    400    if (testarea == MAP_FAILED) {
    401        error_report("%s: Failed to map test area: %s", __func__,
    402                     strerror(errno));
    403        goto out;
    404    }
    405    g_assert(((size_t)testarea & (pagesize - 1)) == 0);
    406
    407    reg_struct.range.start = (uintptr_t)testarea;
    408    reg_struct.range.len = pagesize;
    409    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
    410
    411    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
    412        error_report("%s userfault register: %s", __func__, strerror(errno));
    413        goto out;
    414    }
    415
    416    range_struct.start = (uintptr_t)testarea;
    417    range_struct.len = pagesize;
    418    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
    419        error_report("%s userfault unregister: %s", __func__, strerror(errno));
    420        goto out;
    421    }
    422
    423    feature_mask = (__u64)1 << _UFFDIO_WAKE |
    424                   (__u64)1 << _UFFDIO_COPY |
    425                   (__u64)1 << _UFFDIO_ZEROPAGE;
    426    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
    427        error_report("Missing userfault map features: %" PRIx64,
    428                     (uint64_t)(~reg_struct.ioctls & feature_mask));
    429        goto out;
    430    }
    431
    432    /* Success! */
    433    ret = true;
    434out:
    435    if (testarea) {
    436        munmap(testarea, pagesize);
    437    }
    438    if (ufd != -1) {
    439        close(ufd);
    440    }
    441    return ret;
    442}
    443
     444/*
     445 * Set up an area of RAM so that it *can* be used for postcopy later; this
     446 * must be done right at the start, prior to precopy.
     447 * opaque should be the MIS.
     448 */
    449static int init_range(RAMBlock *rb, void *opaque)
    450{
    451    const char *block_name = qemu_ram_get_idstr(rb);
    452    void *host_addr = qemu_ram_get_host_addr(rb);
    453    ram_addr_t offset = qemu_ram_get_offset(rb);
    454    ram_addr_t length = qemu_ram_get_used_length(rb);
    455    trace_postcopy_init_range(block_name, host_addr, offset, length);
    456
    457    /*
    458     * Save the used_length before running the guest. In case we have to
    459     * resize RAM blocks when syncing RAM block sizes from the source during
    460     * precopy, we'll update it manually via the ram block notifier.
    461     */
    462    rb->postcopy_length = length;
    463
    464    /*
    465     * We need the whole of RAM to be truly empty for postcopy, so things
    466     * like ROMs and any data tables built during init must be zero'd
    467     * - we're going to get the copy from the source anyway.
    468     * (Precopy will just overwrite this data, so doesn't need the discard)
    469     */
    470    if (ram_discard_range(block_name, 0, length)) {
    471        return -1;
    472    }
    473
    474    return 0;
    475}
    476
    477/*
    478 * At the end of migration, undo the effects of init_range
    479 * opaque should be the MIS.
    480 */
    481static int cleanup_range(RAMBlock *rb, void *opaque)
    482{
    483    const char *block_name = qemu_ram_get_idstr(rb);
    484    void *host_addr = qemu_ram_get_host_addr(rb);
    485    ram_addr_t offset = qemu_ram_get_offset(rb);
    486    ram_addr_t length = rb->postcopy_length;
    487    MigrationIncomingState *mis = opaque;
    488    struct uffdio_range range_struct;
    489    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
    490
     491    /*
     492     * We turned off hugepages for the precopy stage when postcopy was
     493     * enabled; we can turn them back on now.
     494     */
    495    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
    496
    497    /*
    498     * We can also turn off userfault now since we should have all the
    499     * pages.   It can be useful to leave it on to debug postcopy
    500     * if you're not sure it's always getting every page.
    501     */
    502    range_struct.start = (uintptr_t)host_addr;
    503    range_struct.len = length;
    504
    505    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
    506        error_report("%s: userfault unregister %s", __func__, strerror(errno));
    507
    508        return -1;
    509    }
    510
    511    return 0;
    512}
    513
    514/*
    515 * Initialise postcopy-ram, setting the RAM to a state where we can go into
    516 * postcopy later; must be called prior to any precopy.
     517 * Called from arch_init's similarly named ram_postcopy_incoming_init.
    518 */
    519int postcopy_ram_incoming_init(MigrationIncomingState *mis)
    520{
    521    if (foreach_not_ignored_block(init_range, NULL)) {
    522        return -1;
    523    }
    524
    525    return 0;
    526}
    527
    528/*
    529 * At the end of a migration where postcopy_ram_incoming_init was called.
    530 */
    531int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
    532{
    533    trace_postcopy_ram_incoming_cleanup_entry();
    534
    535    if (mis->have_fault_thread) {
    536        Error *local_err = NULL;
    537
    538        /* Let the fault thread quit */
    539        qatomic_set(&mis->fault_thread_quit, 1);
    540        postcopy_fault_thread_notify(mis);
    541        trace_postcopy_ram_incoming_cleanup_join();
    542        qemu_thread_join(&mis->fault_thread);
    543
    544        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
    545            error_report_err(local_err);
    546            return -1;
    547        }
    548
    549        if (foreach_not_ignored_block(cleanup_range, mis)) {
    550            return -1;
    551        }
    552
    553        trace_postcopy_ram_incoming_cleanup_closeuf();
    554        close(mis->userfault_fd);
    555        close(mis->userfault_event_fd);
    556        mis->have_fault_thread = false;
    557    }
    558
    559    if (enable_mlock) {
    560        if (os_mlock() < 0) {
    561            error_report("mlock: %s", strerror(errno));
    562            /*
    563             * It doesn't feel right to fail at this point, we have a valid
    564             * VM state.
    565             */
    566        }
    567    }
    568
    569    if (mis->postcopy_tmp_page) {
    570        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
    571        mis->postcopy_tmp_page = NULL;
    572    }
    573    if (mis->postcopy_tmp_zero_page) {
    574        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
    575        mis->postcopy_tmp_zero_page = NULL;
    576    }
    577    trace_postcopy_ram_incoming_cleanup_blocktime(
    578            get_postcopy_total_blocktime());
    579
    580    trace_postcopy_ram_incoming_cleanup_exit();
    581    return 0;
    582}
    583
    584/*
    585 * Disable huge pages on an area
    586 */
    587static int nhp_range(RAMBlock *rb, void *opaque)
    588{
    589    const char *block_name = qemu_ram_get_idstr(rb);
    590    void *host_addr = qemu_ram_get_host_addr(rb);
    591    ram_addr_t offset = qemu_ram_get_offset(rb);
    592    ram_addr_t length = rb->postcopy_length;
    593    trace_postcopy_nhp_range(block_name, host_addr, offset, length);
    594
    595    /*
    596     * Before we do discards we need to ensure those discards really
    597     * do delete areas of the page, even if THP thinks a hugepage would
    598     * be a good idea, so force hugepages off.
    599     */
    600    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
    601
    602    return 0;
    603}
    604
     605/*
     606 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discarding;
     607 * however, leaving it until after precopy means that most of the precopy
     608 * data is still THP-backed.
     609 */
    610int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
    611{
    612    if (foreach_not_ignored_block(nhp_range, mis)) {
    613        return -1;
    614    }
    615
    616    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
    617
    618    return 0;
    619}
    620
     621/*
     622 * Mark the given RAM block as requiring notification on accesses to
     623 * areas that have not yet been written (copied across).
     624 * Used as a callback on foreach_not_ignored_block.
     625 *   rb: RAMBlock to register with the userfault fd
     626 *   opaque: MigrationIncomingState pointer
     627 * The whole postcopy_length of the block is registered in MISSING mode.
     628 * Returns 0 on success
     629 */
    630static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
    631{
    632    MigrationIncomingState *mis = opaque;
    633    struct uffdio_register reg_struct;
    634
    635    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
    636    reg_struct.range.len = rb->postcopy_length;
    637    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
    638
    639    /* Now tell our userfault_fd that it's responsible for this area */
    640    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
    641        error_report("%s userfault register: %s", __func__, strerror(errno));
    642        return -1;
    643    }
    644    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
    645        error_report("%s userfault: Region doesn't support COPY", __func__);
    646        return -1;
    647    }
    648    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
    649        qemu_ram_set_uf_zeroable(rb);
    650    }
    651
    652    return 0;
    653}
    654
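        /*
         * Wake any thread waiting on client_addr in the shared region served
         * by pcfd, typically once the page has been placed.
         */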
    655int postcopy_wake_shared(struct PostCopyFD *pcfd,
    656                         uint64_t client_addr,
    657                         RAMBlock *rb)
    658{
    659    size_t pagesize = qemu_ram_pagesize(rb);
    660    struct uffdio_range range;
    661    int ret;
    662    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    663    range.start = client_addr & ~(pagesize - 1);
    664    range.len = pagesize;
    665    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    666    if (ret) {
    667        error_report("%s: Failed to wake: %zx in %s (%s)",
    668                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
    669                     strerror(errno));
    670    }
    671    return ret;
    672}
    673
     674/*
     675 * Callback from shared fault handlers to ask for a page;
     676 * the page must be specified by a RAMBlock and an offset in that rb.
     677 * Note: Only for use by shared fault handlers (in fault thread)
     678 */
    679int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
    680                                 uint64_t client_addr, uint64_t rb_offset)
    681{
    682    size_t pagesize = qemu_ram_pagesize(rb);
    683    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    684    MigrationIncomingState *mis = migration_incoming_get_current();
    685
    686    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
    687                                       rb_offset);
    688    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
    689        trace_postcopy_request_shared_page_present(pcfd->idstr,
    690                                        qemu_ram_get_idstr(rb), rb_offset);
    691        return postcopy_wake_shared(pcfd, client_addr, rb);
    692    }
    693    migrate_send_rp_req_pages(mis, rb, aligned_rbo, client_addr);
    694    return 0;
    695}
    696
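        /*
         * Map the faulting thread id reported by the kernel to a vCPU index;
         * returns -1 if the thread id does not belong to a vCPU.
         */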
    697static int get_mem_fault_cpu_index(uint32_t pid)
    698{
    699    CPUState *cpu_iter;
    700
    701    CPU_FOREACH(cpu_iter) {
    702        if (cpu_iter->thread_id == pid) {
    703            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
    704            return cpu_iter->cpu_index;
    705        }
    706    }
    707    trace_get_mem_fault_cpu_index(-1, pid);
    708    return -1;
    709}
    710
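        /*
         * Milliseconds elapsed since the blocktime context was created, clamped
         * to a minimum of 1 (so that 0 can mean "no fault pending") and
         * truncated to 32 bits.
         */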
    711static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
    712{
    713    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
    714                                    dc->start_time;
    715    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
    716}
    717
     718/*
     719 * This function is called when a page fault occurs. It tracks
     720 * how long the vCPU stays blocked on the fault.
     721 *
     722 * @addr: faulted host virtual address
     723 * @ptid: faulted process thread id
     724 * @rb: ramblock appropriate to addr
     725 */
    726static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
    727                                          RAMBlock *rb)
    728{
    729    int cpu, already_received;
    730    MigrationIncomingState *mis = migration_incoming_get_current();
    731    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    732    uint32_t low_time_offset;
    733
    734    if (!dc || ptid == 0) {
    735        return;
    736    }
    737    cpu = get_mem_fault_cpu_index(ptid);
    738    if (cpu < 0) {
    739        return;
    740    }
    741
    742    low_time_offset = get_low_time_offset(dc);
    743    if (dc->vcpu_addr[cpu] == 0) {
    744        qatomic_inc(&dc->smp_cpus_down);
    745    }
    746
    747    qatomic_xchg(&dc->last_begin, low_time_offset);
    748    qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    749    qatomic_xchg(&dc->vcpu_addr[cpu], addr);
    750
     751    /*
     752     * Check this here rather than at the beginning of the function,
     753     * because otherwise the check could happen earlier than bitmap_set
     754     * in qemu_ufd_copy_ioctl.
     755     */
    756    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    757    if (already_received) {
    758        qatomic_xchg(&dc->vcpu_addr[cpu], 0);
    759        qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
    760        qatomic_dec(&dc->smp_cpus_down);
    761    }
    762    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
    763                                        cpu, already_received);
    764}
    765
    766/*
     767 *  This function calculates the blocktime per vCPU and traces it.
     768 *  The total (overlapped) blocktime is accumulated here as well.
    769 *
    770 *
     771 * Assume we have 3 CPUs:
    772 *
    773 *      S1        E1           S1               E1
    774 * -----***********------------xxx***************------------------------> CPU1
    775 *
    776 *             S2                E2
    777 * ------------****************xxx---------------------------------------> CPU2
    778 *
    779 *                         S3            E3
    780 * ------------------------****xxx********-------------------------------> CPU3
    781 *
    782 * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
     783 * S2,E1 - doesn't count, because the sequence S1,S2,E1 doesn't include CPU3
     784 * S3,S1,E2 - this sequence includes all CPUs, so the overlap S1..E2 is
     785 *            part of the total blocktime.
     786 * S1 - here is last_begin
     787 * Legend:
    788 *              * - means blocktime per vCPU
    789 *              x - means overlapped blocktime (total blocktime)
    790 *
    791 * @addr: host virtual address
    792 */
    793static void mark_postcopy_blocktime_end(uintptr_t addr)
    794{
    795    MigrationIncomingState *mis = migration_incoming_get_current();
    796    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    797    MachineState *ms = MACHINE(qdev_get_machine());
    798    unsigned int smp_cpus = ms->smp.cpus;
    799    int i, affected_cpu = 0;
    800    bool vcpu_total_blocktime = false;
    801    uint32_t read_vcpu_time, low_time_offset;
    802
    803    if (!dc) {
    804        return;
    805    }
    806
    807    low_time_offset = get_low_time_offset(dc);
     808    /* Look up the vCPU(s) blocked on this address and clear them.
     809     * This linear scan is straightforward but not optimal; a better
     810     * algorithm would keep a tree or hash keyed by address, whose
     811     * value is the list of blocked vCPUs. */
    812    for (i = 0; i < smp_cpus; i++) {
    813        uint32_t vcpu_blocktime = 0;
    814
    815        read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
    816        if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
    817            read_vcpu_time == 0) {
    818            continue;
    819        }
    820        qatomic_xchg(&dc->vcpu_addr[i], 0);
    821        vcpu_blocktime = low_time_offset - read_vcpu_time;
    822        affected_cpu += 1;
     823        /* We need to know whether this end mark is due to a faulted
     824         * page; the other possibility is a prefetched page, in which
     825         * case the total blocktime should not be updated. */
    826        if (!vcpu_total_blocktime &&
    827            qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
    828            vcpu_total_blocktime = true;
    829        }
     830        /* continue the loop, since one page can affect several vCPUs */
    831        dc->vcpu_blocktime[i] += vcpu_blocktime;
    832    }
    833
    834    qatomic_sub(&dc->smp_cpus_down, affected_cpu);
    835    if (vcpu_total_blocktime) {
    836        dc->total_blocktime += low_time_offset - qatomic_fetch_add(
    837                &dc->last_begin, 0);
    838    }
    839    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
    840                                      affected_cpu);
    841}
    842
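        /*
         * Park the fault thread while the connection to the source is broken;
         * returns true once it has been woken to continue.
         */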
    843static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
    844{
    845    trace_postcopy_pause_fault_thread();
    846
    847    qemu_sem_wait(&mis->postcopy_pause_sem_fault);
    848
    849    trace_postcopy_pause_fault_thread_continued();
    850
    851    return true;
    852}
    853
    854/*
    855 * Handle faults detected by the USERFAULT markings
    856 */
    857static void *postcopy_ram_fault_thread(void *opaque)
    858{
    859    MigrationIncomingState *mis = opaque;
    860    struct uffd_msg msg;
    861    int ret;
    862    size_t index;
    863    RAMBlock *rb = NULL;
    864
    865    trace_postcopy_ram_fault_thread_entry();
    866    rcu_register_thread();
    867    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    868    qemu_sem_post(&mis->fault_thread_sem);
    869
    870    struct pollfd *pfd;
    871    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
    872
    873    pfd = g_new0(struct pollfd, pfd_len);
    874
    875    pfd[0].fd = mis->userfault_fd;
    876    pfd[0].events = POLLIN;
    877    pfd[1].fd = mis->userfault_event_fd;
    878    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
    879    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    880    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
    881        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
    882                                                 struct PostCopyFD, index);
    883        pfd[2 + index].fd = pcfd->fd;
    884        pfd[2 + index].events = POLLIN;
    885        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
    886                                                  pcfd->fd);
    887    }
    888
    889    while (true) {
    890        ram_addr_t rb_offset;
    891        int poll_result;
    892
     893        /*
     894         * We're mainly waiting for the kernel to give us a faulting HVA;
     895         * however, we can be told to quit via userfault_event_fd, which
     896         * is an eventfd.
     897         */
    898
    899        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
    900        if (poll_result == -1) {
    901            error_report("%s: userfault poll: %s", __func__, strerror(errno));
    902            break;
    903        }
    904
    905        if (!mis->to_src_file) {
     906            /*
     907             * Possibly someone has told us via the event that the
     908             * return path is already broken. Hold here until the
     909             * channel is rebuilt.
     910             */
    911            if (postcopy_pause_fault_thread(mis)) {
    912                /* Continue to read the userfaultfd */
    913            } else {
    914                error_report("%s: paused but don't allow to continue",
    915                             __func__);
    916                break;
    917            }
    918        }
    919
    920        if (pfd[1].revents) {
    921            uint64_t tmp64 = 0;
    922
    923            /* Consume the signal */
    924            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
    925                /* Nothing obviously nicer than posting this error. */
    926                error_report("%s: read() failed", __func__);
    927            }
    928
    929            if (qatomic_read(&mis->fault_thread_quit)) {
    930                trace_postcopy_ram_fault_thread_quit();
    931                break;
    932            }
    933        }
    934
    935        if (pfd[0].revents) {
    936            poll_result--;
    937            ret = read(mis->userfault_fd, &msg, sizeof(msg));
    938            if (ret != sizeof(msg)) {
    939                if (errno == EAGAIN) {
    940                    /*
    941                     * if a wake up happens on the other thread just after
    942                     * the poll, there is nothing to read.
    943                     */
    944                    continue;
    945                }
    946                if (ret < 0) {
    947                    error_report("%s: Failed to read full userfault "
    948                                 "message: %s",
    949                                 __func__, strerror(errno));
    950                    break;
    951                } else {
    952                    error_report("%s: Read %d bytes from userfaultfd "
    953                                 "expected %zd",
    954                                 __func__, ret, sizeof(msg));
    955                    break; /* Lost alignment, don't know what we'd read next */
    956                }
    957            }
    958            if (msg.event != UFFD_EVENT_PAGEFAULT) {
     959                error_report("%s: Read unexpected event %u from userfaultfd",
    960                             __func__, msg.event);
    961                continue; /* It's not a page fault, shouldn't happen */
    962            }
    963
    964            rb = qemu_ram_block_from_host(
    965                     (void *)(uintptr_t)msg.arg.pagefault.address,
    966                     true, &rb_offset);
    967            if (!rb) {
    968                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
    969                             PRIx64, (uint64_t)msg.arg.pagefault.address);
    970                break;
    971            }
    972
    973            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
    974            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
    975                                                qemu_ram_get_idstr(rb),
    976                                                rb_offset,
    977                                                msg.arg.pagefault.feat.ptid);
    978            mark_postcopy_blocktime_begin(
    979                    (uintptr_t)(msg.arg.pagefault.address),
    980                                msg.arg.pagefault.feat.ptid, rb);
    981
    982retry:
    983            /*
    984             * Send the request to the source - we want to request one
    985             * of our host page sizes (which is >= TPS)
    986             */
    987            ret = migrate_send_rp_req_pages(mis, rb, rb_offset,
    988                                            msg.arg.pagefault.address);
    989            if (ret) {
    990                /* May be network failure, try to wait for recovery */
    991                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
    992                    /* We got reconnected somehow, try to continue */
    993                    goto retry;
    994                } else {
     995                    /* This is an unavoidable fault */
    996                    error_report("%s: migrate_send_rp_req_pages() get %d",
    997                                 __func__, ret);
    998                    break;
    999                }
   1000            }
   1001        }
   1002
   1003        /* Now handle any requests from external processes on shared memory */
   1004        /* TODO: May need to handle devices deregistering during postcopy */
   1005        for (index = 2; index < pfd_len && poll_result; index++) {
   1006            if (pfd[index].revents) {
   1007                struct PostCopyFD *pcfd =
   1008                    &g_array_index(mis->postcopy_remote_fds,
   1009                                   struct PostCopyFD, index - 2);
   1010
   1011                poll_result--;
   1012                if (pfd[index].revents & POLLERR) {
   1013                    error_report("%s: POLLERR on poll %zd fd=%d",
   1014                                 __func__, index, pcfd->fd);
   1015                    pfd[index].events = 0;
   1016                    continue;
   1017                }
   1018
   1019                ret = read(pcfd->fd, &msg, sizeof(msg));
   1020                if (ret != sizeof(msg)) {
   1021                    if (errno == EAGAIN) {
   1022                        /*
   1023                         * if a wake up happens on the other thread just after
   1024                         * the poll, there is nothing to read.
   1025                         */
   1026                        continue;
   1027                    }
   1028                    if (ret < 0) {
   1029                        error_report("%s: Failed to read full userfault "
   1030                                     "message: %s (shared) revents=%d",
   1031                                     __func__, strerror(errno),
   1032                                     pfd[index].revents);
   1033                        /*TODO: Could just disable this sharer */
   1034                        break;
   1035                    } else {
   1036                        error_report("%s: Read %d bytes from userfaultfd "
   1037                                     "expected %zd (shared)",
   1038                                     __func__, ret, sizeof(msg));
   1039                        /*TODO: Could just disable this sharer */
    1040                        break; /* Lost alignment, don't know what we'd read next */
   1041                    }
   1042                }
   1043                if (msg.event != UFFD_EVENT_PAGEFAULT) {
    1044                    error_report("%s: Read unexpected event %u "
   1045                                 "from userfaultfd (shared)",
   1046                                 __func__, msg.event);
   1047                    continue; /* It's not a page fault, shouldn't happen */
   1048                }
   1049                /* Call the device handler registered with us */
   1050                ret = pcfd->handler(pcfd, &msg);
   1051                if (ret) {
   1052                    error_report("%s: Failed to resolve shared fault on %zd/%s",
   1053                                 __func__, index, pcfd->idstr);
   1054                    /* TODO: Fail? Disable this sharer? */
   1055                }
   1056            }
   1057        }
   1058    }
   1059    rcu_unregister_thread();
   1060    trace_postcopy_ram_fault_thread_exit();
   1061    g_free(pfd);
   1062    return NULL;
   1063}
   1064
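        /*
         * Open and configure the userfault fd, start the fault thread, register
         * all RAM blocks for userfault and allocate the temporary pages used to
         * place incoming pages atomically. Returns 0 on success.
         */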
   1065int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
   1066{
   1067    /* Open the fd for the kernel to give us userfaults */
   1068    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
   1069    if (mis->userfault_fd == -1) {
   1070        error_report("%s: Failed to open userfault fd: %s", __func__,
   1071                     strerror(errno));
   1072        return -1;
   1073    }
   1074
   1075    /*
   1076     * Although the host check already tested the API, we need to
   1077     * do the check again as an ABI handshake on the new fd.
   1078     */
   1079    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
   1080        return -1;
   1081    }
   1082
   1083    /* Now an eventfd we use to tell the fault-thread to quit */
   1084    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
   1085    if (mis->userfault_event_fd == -1) {
   1086        error_report("%s: Opening userfault_event_fd: %s", __func__,
   1087                     strerror(errno));
   1088        close(mis->userfault_fd);
   1089        return -1;
   1090    }
   1091
   1092    qemu_sem_init(&mis->fault_thread_sem, 0);
   1093    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
   1094                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
   1095    qemu_sem_wait(&mis->fault_thread_sem);
   1096    qemu_sem_destroy(&mis->fault_thread_sem);
   1097    mis->have_fault_thread = true;
   1098
   1099    /* Mark so that we get notified of accesses to unwritten areas */
   1100    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
   1101        error_report("ram_block_enable_notify failed");
   1102        return -1;
   1103    }
   1104
   1105    mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
   1106                                  PROT_READ | PROT_WRITE, MAP_PRIVATE |
   1107                                  MAP_ANONYMOUS, -1, 0);
   1108    if (mis->postcopy_tmp_page == MAP_FAILED) {
   1109        mis->postcopy_tmp_page = NULL;
   1110        error_report("%s: Failed to map postcopy_tmp_page %s",
   1111                     __func__, strerror(errno));
   1112        return -1;
   1113    }
   1114
   1115    /*
   1116     * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
   1117     */
   1118    mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
   1119                                       PROT_READ | PROT_WRITE,
   1120                                       MAP_PRIVATE | MAP_ANONYMOUS,
   1121                                       -1, 0);
   1122    if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
   1123        int e = errno;
   1124        mis->postcopy_tmp_zero_page = NULL;
   1125        error_report("%s: Failed to map large zero page %s",
   1126                     __func__, strerror(e));
   1127        return -e;
   1128    }
   1129    memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
   1130
   1131    trace_postcopy_ram_enable_notify();
   1132
   1133    return 0;
   1134}
   1135
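        /*
         * Atomically place a page at host_addr via the userfault fd: UFFDIO_COPY
         * from from_addr if it is non-NULL, otherwise UFFDIO_ZEROPAGE. On success
         * the receive bitmap and the requested-page bookkeeping are updated.
         */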
   1136static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
   1137                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
   1138{
   1139    int userfault_fd = mis->userfault_fd;
   1140    int ret;
   1141
   1142    if (from_addr) {
   1143        struct uffdio_copy copy_struct;
   1144        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
   1145        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
   1146        copy_struct.len = pagesize;
   1147        copy_struct.mode = 0;
   1148        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
   1149    } else {
   1150        struct uffdio_zeropage zero_struct;
   1151        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
   1152        zero_struct.range.len = pagesize;
   1153        zero_struct.mode = 0;
   1154        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
   1155    }
   1156    if (!ret) {
   1157        qemu_mutex_lock(&mis->page_request_mutex);
   1158        ramblock_recv_bitmap_set_range(rb, host_addr,
   1159                                       pagesize / qemu_target_page_size());
   1160        /*
    1161         * If this page resolves a page fault for a previously recorded faulted
   1162         * address, take a special note to maintain the requested page list.
   1163         */
   1164        if (g_tree_lookup(mis->page_requested, host_addr)) {
   1165            g_tree_remove(mis->page_requested, host_addr);
   1166            mis->page_requested_count--;
   1167            trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
   1168        }
   1169        qemu_mutex_unlock(&mis->page_request_mutex);
   1170        mark_postcopy_blocktime_end((uintptr_t)host_addr);
   1171    }
   1172    return ret;
   1173}
   1174
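        /*
         * Tell every registered shared-memory userfault fd that the page at
         * (rb, offset) has arrived, so their waiters can be woken.
         */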
   1175int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
   1176{
   1177    int i;
   1178    MigrationIncomingState *mis = migration_incoming_get_current();
   1179    GArray *pcrfds = mis->postcopy_remote_fds;
   1180
   1181    for (i = 0; i < pcrfds->len; i++) {
   1182        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
   1183        int ret = cur->waker(cur, rb, offset);
   1184        if (ret) {
   1185            return ret;
   1186        }
   1187    }
   1188    return 0;
   1189}
   1190
   1191/*
   1192 * Place a host page (from) at (host) atomically
   1193 * returns 0 on success
   1194 */
   1195int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
   1196                        RAMBlock *rb)
   1197{
   1198    size_t pagesize = qemu_ram_pagesize(rb);
   1199
    1200    /* The copy also acts as an ack to the kernel, waking the stalled thread.
    1201     * TODO: We could inhibit that ack and only do it if it was requested,
    1202     * which would be slightly cheaper, but we'd have to be careful
    1203     * of the order of updating our page state.
    1204     */
   1205    if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) {
   1206        int e = errno;
   1207        error_report("%s: %s copy host: %p from: %p (size: %zd)",
   1208                     __func__, strerror(e), host, from, pagesize);
   1209
   1210        return -e;
   1211    }
   1212
   1213    trace_postcopy_place_page(host);
   1214    return postcopy_notify_shared_wake(rb,
   1215                                       qemu_ram_block_host_offset(rb, host));
   1216}
   1217
   1218/*
   1219 * Place a zero page at (host) atomically
   1220 * returns 0 on success
   1221 */
   1222int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
   1223                             RAMBlock *rb)
   1224{
   1225    size_t pagesize = qemu_ram_pagesize(rb);
   1226    trace_postcopy_place_page_zero(host);
   1227
   1228    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
   1229     * but it's not available for everything (e.g. hugetlbpages)
   1230     */
   1231    if (qemu_ram_is_uf_zeroable(rb)) {
   1232        if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) {
   1233            int e = errno;
   1234            error_report("%s: %s zero host: %p",
   1235                         __func__, strerror(e), host);
   1236
   1237            return -e;
   1238        }
   1239        return postcopy_notify_shared_wake(rb,
   1240                                           qemu_ram_block_host_offset(rb,
   1241                                                                      host));
   1242    } else {
   1243        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
   1244    }
   1245}
   1246
   1247#else
   1248/* No target OS support, stubs just fail */
   1249void fill_destination_postcopy_migration_info(MigrationInfo *info)
   1250{
   1251}
   1252
   1253bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
   1254{
   1255    error_report("%s: No OS support", __func__);
   1256    return false;
   1257}
   1258
   1259int postcopy_ram_incoming_init(MigrationIncomingState *mis)
   1260{
   1261    error_report("postcopy_ram_incoming_init: No OS support");
   1262    return -1;
   1263}
   1264
   1265int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
   1266{
   1267    assert(0);
   1268    return -1;
   1269}
   1270
   1271int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
   1272{
   1273    assert(0);
   1274    return -1;
   1275}
   1276
   1277int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
   1278                                 uint64_t client_addr, uint64_t rb_offset)
   1279{
   1280    assert(0);
   1281    return -1;
   1282}
   1283
   1284int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
   1285{
   1286    assert(0);
   1287    return -1;
   1288}
   1289
   1290int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
   1291                        RAMBlock *rb)
   1292{
   1293    assert(0);
   1294    return -1;
   1295}
   1296
   1297int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
   1298                        RAMBlock *rb)
   1299{
   1300    assert(0);
   1301    return -1;
   1302}
   1303
   1304int postcopy_wake_shared(struct PostCopyFD *pcfd,
   1305                         uint64_t client_addr,
   1306                         RAMBlock *rb)
   1307{
   1308    assert(0);
   1309    return -1;
   1310}
   1311#endif
   1312
   1313/* ------------------------------------------------------------------------- */
   1314
   1315void postcopy_fault_thread_notify(MigrationIncomingState *mis)
   1316{
   1317    uint64_t tmp64 = 1;
   1318
   1319    /*
   1320     * Wakeup the fault_thread.  It's an eventfd that should currently
   1321     * be at 0, we're going to increment it to 1
   1322     */
   1323    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
   1324        /* Not much we can do here, but may as well report it */
   1325        error_report("%s: incrementing failed: %s", __func__,
   1326                     strerror(errno));
   1327    }
   1328}
   1329
    1330/**
    1331 * postcopy_discard_send_init: Called at the start of each RAMBlock before
    1332 *   asking to discard individual ranges. Resets the discard state for the
    1333 *   named block.
    1334 *
    1335 * @ms: The current migration state.
    1336 * @name: RAMBlock that discards will operate on.
    1337 */
   1338static PostcopyDiscardState pds = {0};
   1339void postcopy_discard_send_init(MigrationState *ms, const char *name)
   1340{
   1341    pds.ramblock_name = name;
   1342    pds.cur_entry = 0;
   1343    pds.nsentwords = 0;
   1344    pds.nsentcmds = 0;
   1345}
   1346
   1347/**
   1348 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
   1349 *   discard. May send a discard message, may just leave it queued to
   1350 *   be sent later.
   1351 *
   1352 * @ms: Current migration state.
   1353 * @start,@length: a range of pages in the migration bitmap in the
   1354 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
   1355 */
   1356void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
   1357                                 unsigned long length)
   1358{
   1359    size_t tp_size = qemu_target_page_size();
   1360    /* Convert to byte offsets within the RAM block */
   1361    pds.start_list[pds.cur_entry] = start  * tp_size;
   1362    pds.length_list[pds.cur_entry] = length * tp_size;
   1363    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
   1364    pds.cur_entry++;
   1365    pds.nsentwords++;
   1366
   1367    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
   1368        /* Full set, ship it! */
   1369        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
   1370                                              pds.ramblock_name,
   1371                                              pds.cur_entry,
   1372                                              pds.start_list,
   1373                                              pds.length_list);
   1374        pds.nsentcmds++;
   1375        pds.cur_entry = 0;
   1376    }
   1377}
   1378
   1379/**
   1380 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
    1381 * bitmap code. Sends any outstanding discard messages.
   1382 *
   1383 * @ms: Current migration state.
   1384 */
   1385void postcopy_discard_send_finish(MigrationState *ms)
   1386{
   1387    /* Anything unsent? */
   1388    if (pds.cur_entry) {
   1389        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
   1390                                              pds.ramblock_name,
   1391                                              pds.cur_entry,
   1392                                              pds.start_list,
   1393                                              pds.length_list);
   1394        pds.nsentcmds++;
   1395    }
   1396
   1397    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
   1398                                       pds.nsentcmds);
   1399}
   1400
   1401/*
   1402 * Current state of incoming postcopy; note this is not part of
    1403 * MigrationIncomingState since its state is used during cleanup
   1404 * at the end as MIS is being freed.
   1405 */
   1406static PostcopyState incoming_postcopy_state;
   1407
   1408PostcopyState  postcopy_state_get(void)
   1409{
   1410    return qatomic_mb_read(&incoming_postcopy_state);
   1411}
   1412
   1413/* Set the state and return the old state */
   1414PostcopyState postcopy_state_set(PostcopyState new_state)
   1415{
   1416    return qatomic_xchg(&incoming_postcopy_state, new_state);
   1417}
   1418
   1419/* Register a handler for external shared memory postcopy
   1420 * called on the destination.
   1421 */
   1422void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
   1423{
   1424    MigrationIncomingState *mis = migration_incoming_get_current();
   1425
   1426    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
   1427                                                  *pcfd);
   1428}
   1429
   1430/* Unregister a handler for external shared memory postcopy
   1431 */
   1432void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
   1433{
   1434    guint i;
   1435    MigrationIncomingState *mis = migration_incoming_get_current();
   1436    GArray *pcrfds = mis->postcopy_remote_fds;
   1437
   1438    for (i = 0; i < pcrfds->len; i++) {
   1439        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
   1440        if (cur->fd == pcfd->fd) {
   1441            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
   1442            return;
   1443        }
   1444    }
   1445}