cachepc-qemu

Fork of AMDESE/qemu with changes for the cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

io.c (112031B)


      1/*
      2 * Block layer I/O functions
      3 *
      4 * Copyright (c) 2003 Fabrice Bellard
      5 *
      6 * Permission is hereby granted, free of charge, to any person obtaining a copy
      7 * of this software and associated documentation files (the "Software"), to deal
      8 * in the Software without restriction, including without limitation the rights
      9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10 * copies of the Software, and to permit persons to whom the Software is
     11 * furnished to do so, subject to the following conditions:
     12 *
     13 * The above copyright notice and this permission notice shall be included in
     14 * all copies or substantial portions of the Software.
     15 *
     16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22 * THE SOFTWARE.
     23 */
     24
     25#include "qemu/osdep.h"
     26#include "trace.h"
     27#include "sysemu/block-backend.h"
     28#include "block/aio-wait.h"
     29#include "block/blockjob.h"
     30#include "block/blockjob_int.h"
     31#include "block/block_int.h"
     32#include "block/coroutines.h"
     33#include "block/write-threshold.h"
     34#include "qemu/cutils.h"
     35#include "qapi/error.h"
     36#include "qemu/error-report.h"
     37#include "qemu/main-loop.h"
     38#include "sysemu/replay.h"
     39
     40/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
     41#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
     42
     43static void bdrv_parent_cb_resize(BlockDriverState *bs);
     44static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     45    int64_t offset, int64_t bytes, BdrvRequestFlags flags);
     46
     47static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
     48                                      bool ignore_bds_parents)
     49{
     50    BdrvChild *c, *next;
     51
     52    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
     53        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
     54            continue;
     55        }
     56        bdrv_parent_drained_begin_single(c, false);
     57    }
     58}
     59
     60static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
     61                                                   int *drained_end_counter)
     62{
     63    assert(c->parent_quiesce_counter > 0);
     64    c->parent_quiesce_counter--;
     65    if (c->klass->drained_end) {
     66        c->klass->drained_end(c, drained_end_counter);
     67    }
     68}
     69
     70void bdrv_parent_drained_end_single(BdrvChild *c)
     71{
     72    int drained_end_counter = 0;
     73    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
     74    BDRV_POLL_WHILE(c->bs, qatomic_read(&drained_end_counter) > 0);
     75}
     76
     77static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
     78                                    bool ignore_bds_parents,
     79                                    int *drained_end_counter)
     80{
     81    BdrvChild *c;
     82
     83    QLIST_FOREACH(c, &bs->parents, next_parent) {
     84        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
     85            continue;
     86        }
     87        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
     88    }
     89}
     90
     91static bool bdrv_parent_drained_poll_single(BdrvChild *c)
     92{
     93    if (c->klass->drained_poll) {
     94        return c->klass->drained_poll(c);
     95    }
     96    return false;
     97}
     98
     99static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
    100                                     bool ignore_bds_parents)
    101{
    102    BdrvChild *c, *next;
    103    bool busy = false;
    104
    105    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
    106        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
    107            continue;
    108        }
    109        busy |= bdrv_parent_drained_poll_single(c);
    110    }
    111
    112    return busy;
    113}
    114
    115void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
    116{
    117    c->parent_quiesce_counter++;
    118    if (c->klass->drained_begin) {
    119        c->klass->drained_begin(c);
    120    }
    121    if (poll) {
    122        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    123    }
    124}
    125
    126static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
    127{
    128    dst->pdiscard_alignment = MAX(dst->pdiscard_alignment,
    129                                  src->pdiscard_alignment);
    130    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    131    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    132    dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer,
    133                                        src->max_hw_transfer);
    134    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
    135                                 src->opt_mem_alignment);
    136    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
    137                                 src->min_mem_alignment);
    138    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
    139    dst->max_hw_iov = MIN_NON_ZERO(dst->max_hw_iov, src->max_hw_iov);
    140}
    141
    142typedef struct BdrvRefreshLimitsState {
    143    BlockDriverState *bs;
    144    BlockLimits old_bl;
    145} BdrvRefreshLimitsState;
    146
    147static void bdrv_refresh_limits_abort(void *opaque)
    148{
    149    BdrvRefreshLimitsState *s = opaque;
    150
    151    s->bs->bl = s->old_bl;
    152}
    153
    154static TransactionActionDrv bdrv_refresh_limits_drv = {
    155    .abort = bdrv_refresh_limits_abort,
    156    .clean = g_free,
    157};
    158
     159/* @tran is allowed to be NULL; in this case no rollback is possible. */
    160void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
    161{
    162    ERRP_GUARD();
    163    BlockDriver *drv = bs->drv;
    164    BdrvChild *c;
    165    bool have_limits;
    166
    167    if (tran) {
    168        BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
    169        *s = (BdrvRefreshLimitsState) {
    170            .bs = bs,
    171            .old_bl = bs->bl,
    172        };
    173        tran_add(tran, &bdrv_refresh_limits_drv, s);
    174    }
    175
    176    memset(&bs->bl, 0, sizeof(bs->bl));
    177
    178    if (!drv) {
    179        return;
    180    }
    181
    182    /* Default alignment based on whether driver has byte interface */
    183    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
    184                                drv->bdrv_aio_preadv ||
    185                                drv->bdrv_co_preadv_part) ? 1 : 512;
    186
    187    /* Take some limits from the children as a default */
    188    have_limits = false;
    189    QLIST_FOREACH(c, &bs->children, next) {
    190        if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
    191        {
    192            bdrv_refresh_limits(c->bs, tran, errp);
    193            if (*errp) {
    194                return;
    195            }
    196            bdrv_merge_limits(&bs->bl, &c->bs->bl);
    197            have_limits = true;
    198        }
    199    }
    200
    201    if (!have_limits) {
    202        bs->bl.min_mem_alignment = 512;
    203        bs->bl.opt_mem_alignment = qemu_real_host_page_size;
    204
    205        /* Safe default since most protocols use readv()/writev()/etc */
    206        bs->bl.max_iov = IOV_MAX;
    207    }
    208
    209    /* Then let the driver override it */
    210    if (drv->bdrv_refresh_limits) {
    211        drv->bdrv_refresh_limits(bs, errp);
    212        if (*errp) {
    213            return;
    214        }
    215    }
    216
    217    if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) {
    218        error_setg(errp, "Driver requires too large request alignment");
    219    }
    220}
    221
    222/**
    223 * The copy-on-read flag is actually a reference count so multiple users may
    224 * use the feature without worrying about clobbering its previous state.
    225 * Copy-on-read stays enabled until all users have called to disable it.
    226 */
    227void bdrv_enable_copy_on_read(BlockDriverState *bs)
    228{
    229    qatomic_inc(&bs->copy_on_read);
    230}
    231
    232void bdrv_disable_copy_on_read(BlockDriverState *bs)
    233{
    234    int old = qatomic_fetch_dec(&bs->copy_on_read);
    235    assert(old >= 1);
    236}
    237
    238typedef struct {
    239    Coroutine *co;
    240    BlockDriverState *bs;
    241    bool done;
    242    bool begin;
    243    bool recursive;
    244    bool poll;
    245    BdrvChild *parent;
    246    bool ignore_bds_parents;
    247    int *drained_end_counter;
    248} BdrvCoDrainData;
    249
    250static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
    251{
    252    BdrvCoDrainData *data = opaque;
    253    BlockDriverState *bs = data->bs;
    254
    255    if (data->begin) {
    256        bs->drv->bdrv_co_drain_begin(bs);
    257    } else {
    258        bs->drv->bdrv_co_drain_end(bs);
    259    }
    260
    261    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    262    qatomic_mb_set(&data->done, true);
    263    if (!data->begin) {
    264        qatomic_dec(data->drained_end_counter);
    265    }
    266    bdrv_dec_in_flight(bs);
    267
    268    g_free(data);
    269}
    270
    271/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
    272static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
    273                              int *drained_end_counter)
    274{
    275    BdrvCoDrainData *data;
    276
    277    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
    278            (!begin && !bs->drv->bdrv_co_drain_end)) {
    279        return;
    280    }
    281
    282    data = g_new(BdrvCoDrainData, 1);
    283    *data = (BdrvCoDrainData) {
    284        .bs = bs,
    285        .done = false,
    286        .begin = begin,
    287        .drained_end_counter = drained_end_counter,
    288    };
    289
    290    if (!begin) {
    291        qatomic_inc(drained_end_counter);
    292    }
    293
    294    /* Make sure the driver callback completes during the polling phase for
    295     * drain_begin. */
    296    bdrv_inc_in_flight(bs);
    297    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    298    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
    299}
    300
    301/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
    302bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
    303                     BdrvChild *ignore_parent, bool ignore_bds_parents)
    304{
    305    BdrvChild *child, *next;
    306
    307    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
    308        return true;
    309    }
    310
    311    if (qatomic_read(&bs->in_flight)) {
    312        return true;
    313    }
    314
    315    if (recursive) {
    316        assert(!ignore_bds_parents);
    317        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
    318            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
    319                return true;
    320            }
    321        }
    322    }
    323
    324    return false;
    325}
    326
    327static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
    328                                      BdrvChild *ignore_parent)
    329{
    330    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
    331}
    332
    333static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
    334                                  BdrvChild *parent, bool ignore_bds_parents,
    335                                  bool poll);
    336static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
    337                                BdrvChild *parent, bool ignore_bds_parents,
    338                                int *drained_end_counter);
    339
    340static void bdrv_co_drain_bh_cb(void *opaque)
    341{
    342    BdrvCoDrainData *data = opaque;
    343    Coroutine *co = data->co;
    344    BlockDriverState *bs = data->bs;
    345
    346    if (bs) {
    347        AioContext *ctx = bdrv_get_aio_context(bs);
    348        aio_context_acquire(ctx);
    349        bdrv_dec_in_flight(bs);
    350        if (data->begin) {
    351            assert(!data->drained_end_counter);
    352            bdrv_do_drained_begin(bs, data->recursive, data->parent,
    353                                  data->ignore_bds_parents, data->poll);
    354        } else {
    355            assert(!data->poll);
    356            bdrv_do_drained_end(bs, data->recursive, data->parent,
    357                                data->ignore_bds_parents,
    358                                data->drained_end_counter);
    359        }
    360        aio_context_release(ctx);
    361    } else {
    362        assert(data->begin);
    363        bdrv_drain_all_begin();
    364    }
    365
    366    data->done = true;
    367    aio_co_wake(co);
    368}
    369
    370static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
    371                                                bool begin, bool recursive,
    372                                                BdrvChild *parent,
    373                                                bool ignore_bds_parents,
    374                                                bool poll,
    375                                                int *drained_end_counter)
    376{
    377    BdrvCoDrainData data;
    378    Coroutine *self = qemu_coroutine_self();
    379    AioContext *ctx = bdrv_get_aio_context(bs);
    380    AioContext *co_ctx = qemu_coroutine_get_aio_context(self);
    381
    382    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
    383     * other coroutines run if they were queued by aio_co_enter(). */
    384
    385    assert(qemu_in_coroutine());
    386    data = (BdrvCoDrainData) {
    387        .co = self,
    388        .bs = bs,
    389        .done = false,
    390        .begin = begin,
    391        .recursive = recursive,
    392        .parent = parent,
    393        .ignore_bds_parents = ignore_bds_parents,
    394        .poll = poll,
    395        .drained_end_counter = drained_end_counter,
    396    };
    397
    398    if (bs) {
    399        bdrv_inc_in_flight(bs);
    400    }
    401
    402    /*
    403     * Temporarily drop the lock across yield or we would get deadlocks.
     404     * bdrv_co_drain_bh_cb() reacquires the lock as needed.
    405     *
    406     * When we yield below, the lock for the current context will be
    407     * released, so if this is actually the lock that protects bs, don't drop
    408     * it a second time.
    409     */
    410    if (ctx != co_ctx) {
    411        aio_context_release(ctx);
    412    }
    413    replay_bh_schedule_oneshot_event(ctx, bdrv_co_drain_bh_cb, &data);
    414
    415    qemu_coroutine_yield();
    416    /* If we are resumed from some other event (such as an aio completion or a
    417     * timer callback), it is a bug in the caller that should be fixed. */
    418    assert(data.done);
    419
     420    /* Reacquire the AioContext of bs if we dropped it */
    421    if (ctx != co_ctx) {
    422        aio_context_acquire(ctx);
    423    }
    424}
    425
    426void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
    427                                   BdrvChild *parent, bool ignore_bds_parents)
    428{
    429    assert(!qemu_in_coroutine());
    430
    431    /* Stop things in parent-to-child order */
    432    if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
    433        aio_disable_external(bdrv_get_aio_context(bs));
    434    }
    435
    436    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    437    bdrv_drain_invoke(bs, true, NULL);
    438}
    439
    440static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
    441                                  BdrvChild *parent, bool ignore_bds_parents,
    442                                  bool poll)
    443{
    444    BdrvChild *child, *next;
    445
    446    if (qemu_in_coroutine()) {
    447        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
    448                               poll, NULL);
    449        return;
    450    }
    451
    452    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
    453
    454    if (recursive) {
    455        assert(!ignore_bds_parents);
    456        bs->recursive_quiesce_counter++;
    457        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
    458            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
    459                                  false);
    460        }
    461    }
    462
    463    /*
    464     * Wait for drained requests to finish.
    465     *
    466     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
    467     * call is needed so things in this AioContext can make progress even
    468     * though we don't return to the main AioContext loop - this automatically
    469     * includes other nodes in the same AioContext and therefore all child
    470     * nodes.
    471     */
    472    if (poll) {
    473        assert(!ignore_bds_parents);
    474        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    475    }
    476}
    477
    478void bdrv_drained_begin(BlockDriverState *bs)
    479{
    480    bdrv_do_drained_begin(bs, false, NULL, false, true);
    481}
    482
    483void bdrv_subtree_drained_begin(BlockDriverState *bs)
    484{
    485    bdrv_do_drained_begin(bs, true, NULL, false, true);
    486}
    487
    488/**
    489 * This function does not poll, nor must any of its recursively called
    490 * functions.  The *drained_end_counter pointee will be incremented
    491 * once for every background operation scheduled, and decremented once
    492 * the operation settles.  Therefore, the pointer must remain valid
    493 * until the pointee reaches 0.  That implies that whoever sets up the
    494 * pointee has to poll until it is 0.
    495 *
    496 * We use atomic operations to access *drained_end_counter, because
    497 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
    498 *     @bs may contain nodes in different AioContexts,
    499 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
    500 *     regardless of which AioContext they are in.
    501 */
    502static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
    503                                BdrvChild *parent, bool ignore_bds_parents,
    504                                int *drained_end_counter)
    505{
    506    BdrvChild *child;
    507    int old_quiesce_counter;
    508
    509    assert(drained_end_counter != NULL);
    510
    511    if (qemu_in_coroutine()) {
    512        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
    513                               false, drained_end_counter);
    514        return;
    515    }
    516    assert(bs->quiesce_counter > 0);
    517
    518    /* Re-enable things in child-to-parent order */
    519    bdrv_drain_invoke(bs, false, drained_end_counter);
    520    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
    521                            drained_end_counter);
    522
    523    old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
    524    if (old_quiesce_counter == 1) {
    525        aio_enable_external(bdrv_get_aio_context(bs));
    526    }
    527
    528    if (recursive) {
    529        assert(!ignore_bds_parents);
    530        bs->recursive_quiesce_counter--;
    531        QLIST_FOREACH(child, &bs->children, next) {
    532            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
    533                                drained_end_counter);
    534        }
    535    }
    536}
    537
    538void bdrv_drained_end(BlockDriverState *bs)
    539{
    540    int drained_end_counter = 0;
    541    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    542    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
    543}
    544
    545void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
    546{
    547    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
    548}
    549
    550void bdrv_subtree_drained_end(BlockDriverState *bs)
    551{
    552    int drained_end_counter = 0;
    553    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    554    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
    555}
    556
    557void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
    558{
    559    int i;
    560
    561    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
    562        bdrv_do_drained_begin(child->bs, true, child, false, true);
    563    }
    564}
    565
    566void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
    567{
    568    int drained_end_counter = 0;
    569    int i;
    570
    571    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
    572        bdrv_do_drained_end(child->bs, true, child, false,
    573                            &drained_end_counter);
    574    }
    575
    576    BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0);
    577}
    578
    579/*
    580 * Wait for pending requests to complete on a single BlockDriverState subtree,
     581 * and suspend the block driver's internal I/O until the next request arrives.
    582 *
    583 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
    584 * AioContext.
    585 */
    586void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
    587{
    588    assert(qemu_in_coroutine());
    589    bdrv_drained_begin(bs);
    590    bdrv_drained_end(bs);
    591}
    592
    593void bdrv_drain(BlockDriverState *bs)
    594{
    595    bdrv_drained_begin(bs);
    596    bdrv_drained_end(bs);
    597}
    598
    599static void bdrv_drain_assert_idle(BlockDriverState *bs)
    600{
    601    BdrvChild *child, *next;
    602
    603    assert(qatomic_read(&bs->in_flight) == 0);
    604    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
    605        bdrv_drain_assert_idle(child->bs);
    606    }
    607}
    608
    609unsigned int bdrv_drain_all_count = 0;
    610
    611static bool bdrv_drain_all_poll(void)
    612{
    613    BlockDriverState *bs = NULL;
    614    bool result = false;
    615
    616    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
    617     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    618    while ((bs = bdrv_next_all_states(bs))) {
    619        AioContext *aio_context = bdrv_get_aio_context(bs);
    620        aio_context_acquire(aio_context);
    621        result |= bdrv_drain_poll(bs, false, NULL, true);
    622        aio_context_release(aio_context);
    623    }
    624
    625    return result;
    626}
    627
    628/*
    629 * Wait for pending requests to complete across all BlockDriverStates
    630 *
    631 * This function does not flush data to disk, use bdrv_flush_all() for that
    632 * after calling this function.
    633 *
    634 * This pauses all block jobs and disables external clients. It must
    635 * be paired with bdrv_drain_all_end().
    636 *
    637 * NOTE: no new block jobs or BlockDriverStates can be created between
    638 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
    639 */
    640void bdrv_drain_all_begin(void)
    641{
    642    BlockDriverState *bs = NULL;
    643
    644    if (qemu_in_coroutine()) {
    645        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
    646        return;
    647    }
    648
    649    /*
     650     * The bdrv queue is managed by record/replay;
     651     * waiting for the I/O requests to finish may
     652     * take forever
    653     */
    654    if (replay_events_enabled()) {
    655        return;
    656    }
    657
    658    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
    659     * loop AioContext, so make sure we're in the main context. */
    660    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    661    assert(bdrv_drain_all_count < INT_MAX);
    662    bdrv_drain_all_count++;
    663
    664    /* Quiesce all nodes, without polling in-flight requests yet. The graph
    665     * cannot change during this loop. */
    666    while ((bs = bdrv_next_all_states(bs))) {
    667        AioContext *aio_context = bdrv_get_aio_context(bs);
    668
    669        aio_context_acquire(aio_context);
    670        bdrv_do_drained_begin(bs, false, NULL, true, false);
    671        aio_context_release(aio_context);
    672    }
    673
    674    /* Now poll the in-flight requests */
    675    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());
    676
    677    while ((bs = bdrv_next_all_states(bs))) {
    678        bdrv_drain_assert_idle(bs);
    679    }
    680}
    681
    682void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
    683{
    684    int drained_end_counter = 0;
    685
    686    g_assert(bs->quiesce_counter > 0);
    687    g_assert(!bs->refcnt);
    688
    689    while (bs->quiesce_counter) {
    690        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
    691    }
    692    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
    693}
    694
    695void bdrv_drain_all_end(void)
    696{
    697    BlockDriverState *bs = NULL;
    698    int drained_end_counter = 0;
    699
    700    /*
     701     * The bdrv queue is managed by record/replay;
     702     * waiting for the I/O requests to finish may
     703     * take forever
    704     */
    705    if (replay_events_enabled()) {
    706        return;
    707    }
    708
    709    while ((bs = bdrv_next_all_states(bs))) {
    710        AioContext *aio_context = bdrv_get_aio_context(bs);
    711
    712        aio_context_acquire(aio_context);
    713        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
    714        aio_context_release(aio_context);
    715    }
    716
    717    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    718    AIO_WAIT_WHILE(NULL, qatomic_read(&drained_end_counter) > 0);
    719
    720    assert(bdrv_drain_all_count > 0);
    721    bdrv_drain_all_count--;
    722}
    723
    724void bdrv_drain_all(void)
    725{
    726    bdrv_drain_all_begin();
    727    bdrv_drain_all_end();
    728}
    729
    730/**
    731 * Remove an active request from the tracked requests list
    732 *
    733 * This function should be called when a tracked request is completing.
    734 */
    735static void tracked_request_end(BdrvTrackedRequest *req)
    736{
    737    if (req->serialising) {
    738        qatomic_dec(&req->bs->serialising_in_flight);
    739    }
    740
    741    qemu_co_mutex_lock(&req->bs->reqs_lock);
    742    QLIST_REMOVE(req, list);
    743    qemu_co_queue_restart_all(&req->wait_queue);
    744    qemu_co_mutex_unlock(&req->bs->reqs_lock);
    745}
    746
    747/**
    748 * Add an active request to the tracked requests list
    749 */
    750static void tracked_request_begin(BdrvTrackedRequest *req,
    751                                  BlockDriverState *bs,
    752                                  int64_t offset,
    753                                  int64_t bytes,
    754                                  enum BdrvTrackedRequestType type)
    755{
    756    bdrv_check_request(offset, bytes, &error_abort);
    757
    758    *req = (BdrvTrackedRequest){
    759        .bs = bs,
    760        .offset         = offset,
    761        .bytes          = bytes,
    762        .type           = type,
    763        .co             = qemu_coroutine_self(),
    764        .serialising    = false,
    765        .overlap_offset = offset,
    766        .overlap_bytes  = bytes,
    767    };
    768
    769    qemu_co_queue_init(&req->wait_queue);
    770
    771    qemu_co_mutex_lock(&bs->reqs_lock);
    772    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    773    qemu_co_mutex_unlock(&bs->reqs_lock);
    774}
    775
    776static bool tracked_request_overlaps(BdrvTrackedRequest *req,
    777                                     int64_t offset, int64_t bytes)
    778{
    779    bdrv_check_request(offset, bytes, &error_abort);
    780
    781    /*        aaaa   bbbb */
    782    if (offset >= req->overlap_offset + req->overlap_bytes) {
    783        return false;
    784    }
    785    /* bbbb   aaaa        */
    786    if (req->overlap_offset >= offset + bytes) {
    787        return false;
    788    }
    789    return true;
    790}
    791
    792/* Called with self->bs->reqs_lock held */
    793static BdrvTrackedRequest *
    794bdrv_find_conflicting_request(BdrvTrackedRequest *self)
    795{
    796    BdrvTrackedRequest *req;
    797
    798    QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
    799        if (req == self || (!req->serialising && !self->serialising)) {
    800            continue;
    801        }
    802        if (tracked_request_overlaps(req, self->overlap_offset,
    803                                     self->overlap_bytes))
    804        {
    805            /*
    806             * Hitting this means there was a reentrant request, for
    807             * example, a block driver issuing nested requests.  This must
    808             * never happen since it means deadlock.
    809             */
    810            assert(qemu_coroutine_self() != req->co);
    811
    812            /*
    813             * If the request is already (indirectly) waiting for us, or
    814             * will wait for us as soon as it wakes up, then just go on
    815             * (instead of producing a deadlock in the former case).
    816             */
    817            if (!req->waiting_for) {
    818                return req;
    819            }
    820        }
    821    }
    822
    823    return NULL;
    824}
    825
    826/* Called with self->bs->reqs_lock held */
    827static bool coroutine_fn
    828bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
    829{
    830    BdrvTrackedRequest *req;
    831    bool waited = false;
    832
    833    while ((req = bdrv_find_conflicting_request(self))) {
    834        self->waiting_for = req;
    835        qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
    836        self->waiting_for = NULL;
    837        waited = true;
    838    }
    839
    840    return waited;
    841}
    842
    843/* Called with req->bs->reqs_lock held */
    844static void tracked_request_set_serialising(BdrvTrackedRequest *req,
    845                                            uint64_t align)
    846{
    847    int64_t overlap_offset = req->offset & ~(align - 1);
    848    int64_t overlap_bytes =
    849        ROUND_UP(req->offset + req->bytes, align) - overlap_offset;
    850
    851    bdrv_check_request(req->offset, req->bytes, &error_abort);
    852
    853    if (!req->serialising) {
    854        qatomic_inc(&req->bs->serialising_in_flight);
    855        req->serialising = true;
    856    }
    857
    858    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    859    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
    860}
    861
    862/**
    863 * Return the tracked request on @bs for the current coroutine, or
    864 * NULL if there is none.
    865 */
    866BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
    867{
    868    BdrvTrackedRequest *req;
    869    Coroutine *self = qemu_coroutine_self();
    870
    871    QLIST_FOREACH(req, &bs->tracked_requests, list) {
    872        if (req->co == self) {
    873            return req;
    874        }
    875    }
    876
    877    return NULL;
    878}
    879
    880/**
    881 * Round a region to cluster boundaries
    882 */
    883void bdrv_round_to_clusters(BlockDriverState *bs,
    884                            int64_t offset, int64_t bytes,
    885                            int64_t *cluster_offset,
    886                            int64_t *cluster_bytes)
    887{
    888    BlockDriverInfo bdi;
    889
    890    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
    891        *cluster_offset = offset;
    892        *cluster_bytes = bytes;
    893    } else {
    894        int64_t c = bdi.cluster_size;
    895        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
    896        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    897    }
    898}
    899
    900static int bdrv_get_cluster_size(BlockDriverState *bs)
    901{
    902    BlockDriverInfo bdi;
    903    int ret;
    904
    905    ret = bdrv_get_info(bs, &bdi);
    906    if (ret < 0 || bdi.cluster_size == 0) {
    907        return bs->bl.request_alignment;
    908    } else {
    909        return bdi.cluster_size;
    910    }
    911}
    912
    913void bdrv_inc_in_flight(BlockDriverState *bs)
    914{
    915    qatomic_inc(&bs->in_flight);
    916}
    917
    918void bdrv_wakeup(BlockDriverState *bs)
    919{
    920    aio_wait_kick();
    921}
    922
    923void bdrv_dec_in_flight(BlockDriverState *bs)
    924{
    925    qatomic_dec(&bs->in_flight);
    926    bdrv_wakeup(bs);
    927}
    928
    929static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
    930{
    931    BlockDriverState *bs = self->bs;
    932    bool waited = false;
    933
    934    if (!qatomic_read(&bs->serialising_in_flight)) {
    935        return false;
    936    }
    937
    938    qemu_co_mutex_lock(&bs->reqs_lock);
    939    waited = bdrv_wait_serialising_requests_locked(self);
    940    qemu_co_mutex_unlock(&bs->reqs_lock);
    941
    942    return waited;
    943}
    944
    945bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
    946                                                uint64_t align)
    947{
    948    bool waited;
    949
    950    qemu_co_mutex_lock(&req->bs->reqs_lock);
    951
    952    tracked_request_set_serialising(req, align);
    953    waited = bdrv_wait_serialising_requests_locked(req);
    954
    955    qemu_co_mutex_unlock(&req->bs->reqs_lock);
    956
    957    return waited;
    958}
    959
    960int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
    961                            QEMUIOVector *qiov, size_t qiov_offset,
    962                            Error **errp)
    963{
    964    /*
    965     * Check generic offset/bytes correctness
    966     */
    967
    968    if (offset < 0) {
    969        error_setg(errp, "offset is negative: %" PRIi64, offset);
    970        return -EIO;
    971    }
    972
    973    if (bytes < 0) {
    974        error_setg(errp, "bytes is negative: %" PRIi64, bytes);
    975        return -EIO;
    976    }
    977
    978    if (bytes > BDRV_MAX_LENGTH) {
    979        error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
    980                   bytes, BDRV_MAX_LENGTH);
    981        return -EIO;
    982    }
    983
    984    if (offset > BDRV_MAX_LENGTH) {
    985        error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
    986                   offset, BDRV_MAX_LENGTH);
    987        return -EIO;
    988    }
    989
    990    if (offset > BDRV_MAX_LENGTH - bytes) {
    991        error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") "
    992                   "exceeds maximum(%" PRIi64 ")", offset, bytes,
    993                   BDRV_MAX_LENGTH);
    994        return -EIO;
    995    }
    996
    997    if (!qiov) {
    998        return 0;
    999    }
   1000
   1001    /*
   1002     * Check qiov and qiov_offset
   1003     */
   1004
   1005    if (qiov_offset > qiov->size) {
   1006        error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)",
   1007                   qiov_offset, qiov->size);
   1008        return -EIO;
   1009    }
   1010
   1011    if (bytes > qiov->size - qiov_offset) {
   1012        error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io "
   1013                   "vector size(%zu)", bytes, qiov_offset, qiov->size);
   1014        return -EIO;
   1015    }
   1016
   1017    return 0;
   1018}
   1019
   1020int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp)
   1021{
   1022    return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp);
   1023}
   1024
   1025static int bdrv_check_request32(int64_t offset, int64_t bytes,
   1026                                QEMUIOVector *qiov, size_t qiov_offset)
   1027{
   1028    int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
   1029    if (ret < 0) {
   1030        return ret;
   1031    }
   1032
   1033    if (bytes > BDRV_REQUEST_MAX_BYTES) {
   1034        return -EIO;
   1035    }
   1036
   1037    return 0;
   1038}
   1039
   1040int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
   1041                       int64_t bytes, BdrvRequestFlags flags)
   1042{
   1043    return bdrv_pwritev(child, offset, bytes, NULL,
   1044                        BDRV_REQ_ZERO_WRITE | flags);
   1045}
   1046
   1047/*
   1048 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
   1049 * The operation is sped up by checking the block status and only writing
   1050 * zeroes to the device if they currently do not return zeroes. Optional
   1051 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
   1052 * BDRV_REQ_FUA).
   1053 *
   1054 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
   1055 */
   1056int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
   1057{
   1058    int ret;
   1059    int64_t target_size, bytes, offset = 0;
   1060    BlockDriverState *bs = child->bs;
   1061
   1062    target_size = bdrv_getlength(bs);
   1063    if (target_size < 0) {
   1064        return target_size;
   1065    }
   1066
   1067    for (;;) {
   1068        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
   1069        if (bytes <= 0) {
   1070            return 0;
   1071        }
   1072        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
   1073        if (ret < 0) {
   1074            return ret;
   1075        }
   1076        if (ret & BDRV_BLOCK_ZERO) {
   1077            offset += bytes;
   1078            continue;
   1079        }
   1080        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
   1081        if (ret < 0) {
   1082            return ret;
   1083        }
   1084        offset += bytes;
   1085    }
   1086}
   1087
   1088/* See bdrv_pwrite() for the return codes */
   1089int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes)
   1090{
   1091    int ret;
   1092    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
   1093
   1094    if (bytes < 0) {
   1095        return -EINVAL;
   1096    }
   1097
   1098    ret = bdrv_preadv(child, offset, bytes, &qiov,  0);
   1099
   1100    return ret < 0 ? ret : bytes;
   1101}
   1102
   1103/* Return no. of bytes on success or < 0 on error. Important errors are:
   1104  -EIO         generic I/O error (may happen for all errors)
   1105  -ENOMEDIUM   No media inserted.
   1106  -EINVAL      Invalid offset or number of bytes
   1107  -EACCES      Trying to write a read-only device
   1108*/
   1109int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf,
   1110                int64_t bytes)
   1111{
   1112    int ret;
   1113    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
   1114
   1115    if (bytes < 0) {
   1116        return -EINVAL;
   1117    }
   1118
   1119    ret = bdrv_pwritev(child, offset, bytes, &qiov, 0);
   1120
   1121    return ret < 0 ? ret : bytes;
   1122}
   1123
   1124/*
   1125 * Writes to the file and ensures that no writes are reordered across this
   1126 * request (acts as a barrier)
   1127 *
   1128 * Returns 0 on success, -errno in error cases.
   1129 */
   1130int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
   1131                     const void *buf, int64_t count)
   1132{
   1133    int ret;
   1134
   1135    ret = bdrv_pwrite(child, offset, buf, count);
   1136    if (ret < 0) {
   1137        return ret;
   1138    }
   1139
   1140    ret = bdrv_flush(child->bs);
   1141    if (ret < 0) {
   1142        return ret;
   1143    }
   1144
   1145    return 0;
   1146}
   1147
   1148typedef struct CoroutineIOCompletion {
   1149    Coroutine *coroutine;
   1150    int ret;
   1151} CoroutineIOCompletion;
   1152
   1153static void bdrv_co_io_em_complete(void *opaque, int ret)
   1154{
   1155    CoroutineIOCompletion *co = opaque;
   1156
   1157    co->ret = ret;
   1158    aio_co_wake(co->coroutine);
   1159}
   1160
   1161static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
   1162                                           int64_t offset, int64_t bytes,
   1163                                           QEMUIOVector *qiov,
   1164                                           size_t qiov_offset, int flags)
   1165{
   1166    BlockDriver *drv = bs->drv;
   1167    int64_t sector_num;
   1168    unsigned int nb_sectors;
   1169    QEMUIOVector local_qiov;
   1170    int ret;
   1171
   1172    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   1173    assert(!(flags & ~BDRV_REQ_MASK));
   1174    assert(!(flags & BDRV_REQ_NO_FALLBACK));
   1175
   1176    if (!drv) {
   1177        return -ENOMEDIUM;
   1178    }
   1179
   1180    if (drv->bdrv_co_preadv_part) {
   1181        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
   1182                                        flags);
   1183    }
   1184
   1185    if (qiov_offset > 0 || bytes != qiov->size) {
   1186        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
   1187        qiov = &local_qiov;
   1188    }
   1189
   1190    if (drv->bdrv_co_preadv) {
   1191        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
   1192        goto out;
   1193    }
   1194
   1195    if (drv->bdrv_aio_preadv) {
   1196        BlockAIOCB *acb;
   1197        CoroutineIOCompletion co = {
   1198            .coroutine = qemu_coroutine_self(),
   1199        };
   1200
   1201        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
   1202                                   bdrv_co_io_em_complete, &co);
   1203        if (acb == NULL) {
   1204            ret = -EIO;
   1205            goto out;
   1206        } else {
   1207            qemu_coroutine_yield();
   1208            ret = co.ret;
   1209            goto out;
   1210        }
   1211    }
   1212
   1213    sector_num = offset >> BDRV_SECTOR_BITS;
   1214    nb_sectors = bytes >> BDRV_SECTOR_BITS;
   1215
   1216    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
   1217    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
   1218    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
   1219    assert(drv->bdrv_co_readv);
   1220
   1221    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
   1222
   1223out:
   1224    if (qiov == &local_qiov) {
   1225        qemu_iovec_destroy(&local_qiov);
   1226    }
   1227
   1228    return ret;
   1229}
   1230
   1231static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
   1232                                            int64_t offset, int64_t bytes,
   1233                                            QEMUIOVector *qiov,
   1234                                            size_t qiov_offset,
   1235                                            BdrvRequestFlags flags)
   1236{
   1237    BlockDriver *drv = bs->drv;
   1238    int64_t sector_num;
   1239    unsigned int nb_sectors;
   1240    QEMUIOVector local_qiov;
   1241    int ret;
   1242
   1243    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   1244    assert(!(flags & ~BDRV_REQ_MASK));
   1245    assert(!(flags & BDRV_REQ_NO_FALLBACK));
   1246
   1247    if (!drv) {
   1248        return -ENOMEDIUM;
   1249    }
   1250
   1251    if (drv->bdrv_co_pwritev_part) {
   1252        ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
   1253                                        flags & bs->supported_write_flags);
   1254        flags &= ~bs->supported_write_flags;
   1255        goto emulate_flags;
   1256    }
   1257
   1258    if (qiov_offset > 0 || bytes != qiov->size) {
   1259        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
   1260        qiov = &local_qiov;
   1261    }
   1262
   1263    if (drv->bdrv_co_pwritev) {
   1264        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
   1265                                   flags & bs->supported_write_flags);
   1266        flags &= ~bs->supported_write_flags;
   1267        goto emulate_flags;
   1268    }
   1269
   1270    if (drv->bdrv_aio_pwritev) {
   1271        BlockAIOCB *acb;
   1272        CoroutineIOCompletion co = {
   1273            .coroutine = qemu_coroutine_self(),
   1274        };
   1275
   1276        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
   1277                                    flags & bs->supported_write_flags,
   1278                                    bdrv_co_io_em_complete, &co);
   1279        flags &= ~bs->supported_write_flags;
   1280        if (acb == NULL) {
   1281            ret = -EIO;
   1282        } else {
   1283            qemu_coroutine_yield();
   1284            ret = co.ret;
   1285        }
   1286        goto emulate_flags;
   1287    }
   1288
   1289    sector_num = offset >> BDRV_SECTOR_BITS;
   1290    nb_sectors = bytes >> BDRV_SECTOR_BITS;
   1291
   1292    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
   1293    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
   1294    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
   1295
   1296    assert(drv->bdrv_co_writev);
   1297    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
   1298                              flags & bs->supported_write_flags);
   1299    flags &= ~bs->supported_write_flags;
   1300
   1301emulate_flags:
   1302    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
   1303        ret = bdrv_co_flush(bs);
   1304    }
   1305
   1306    if (qiov == &local_qiov) {
   1307        qemu_iovec_destroy(&local_qiov);
   1308    }
   1309
   1310    return ret;
   1311}
   1312
   1313static int coroutine_fn
   1314bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset,
   1315                               int64_t bytes, QEMUIOVector *qiov,
   1316                               size_t qiov_offset)
   1317{
   1318    BlockDriver *drv = bs->drv;
   1319    QEMUIOVector local_qiov;
   1320    int ret;
   1321
   1322    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   1323
   1324    if (!drv) {
   1325        return -ENOMEDIUM;
   1326    }
   1327
   1328    if (!block_driver_can_compress(drv)) {
   1329        return -ENOTSUP;
   1330    }
   1331
   1332    if (drv->bdrv_co_pwritev_compressed_part) {
   1333        return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
   1334                                                    qiov, qiov_offset);
   1335    }
   1336
   1337    if (qiov_offset == 0) {
   1338        return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
   1339    }
   1340
   1341    qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
   1342    ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
   1343    qemu_iovec_destroy(&local_qiov);
   1344
   1345    return ret;
   1346}
   1347
   1348static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
   1349        int64_t offset, int64_t bytes, QEMUIOVector *qiov,
   1350        size_t qiov_offset, int flags)
   1351{
   1352    BlockDriverState *bs = child->bs;
   1353
   1354    /* Perform I/O through a temporary buffer so that users who scribble over
   1355     * their read buffer while the operation is in progress do not end up
   1356     * modifying the image file.  This is critical for zero-copy guest I/O
   1357     * where anything might happen inside guest memory.
   1358     */
   1359    void *bounce_buffer = NULL;
   1360
   1361    BlockDriver *drv = bs->drv;
   1362    int64_t cluster_offset;
   1363    int64_t cluster_bytes;
   1364    int64_t skip_bytes;
   1365    int ret;
   1366    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
   1367                                    BDRV_REQUEST_MAX_BYTES);
   1368    int64_t progress = 0;
   1369    bool skip_write;
   1370
   1371    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   1372
   1373    if (!drv) {
   1374        return -ENOMEDIUM;
   1375    }
   1376
   1377    /*
   1378     * Do not write anything when the BDS is inactive.  That is not
   1379     * allowed, and it would not help.
   1380     */
   1381    skip_write = (bs->open_flags & BDRV_O_INACTIVE);
   1382
   1383    /* FIXME We cannot require callers to have write permissions when all they
   1384     * are doing is a read request. If we did things right, write permissions
   1385     * would be obtained anyway, but internally by the copy-on-read code. As
   1386     * long as it is implemented here rather than in a separate filter driver,
   1387     * the copy-on-read code doesn't have its own BdrvChild, however, for which
   1388     * it could request permissions. Therefore we have to bypass the permission
   1389     * system for the moment. */
   1390    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
   1391
   1392    /* Cover entire cluster so no additional backing file I/O is required when
   1393     * allocating cluster in the image file.  Note that this value may exceed
   1394     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
   1395     * is one reason we loop rather than doing it all at once.
   1396     */
   1397    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
   1398    skip_bytes = offset - cluster_offset;
   1399
   1400    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
   1401                                   cluster_offset, cluster_bytes);
   1402
   1403    while (cluster_bytes) {
   1404        int64_t pnum;
   1405
   1406        if (skip_write) {
   1407            ret = 1; /* "already allocated", so nothing will be copied */
   1408            pnum = MIN(cluster_bytes, max_transfer);
   1409        } else {
   1410            ret = bdrv_is_allocated(bs, cluster_offset,
   1411                                    MIN(cluster_bytes, max_transfer), &pnum);
   1412            if (ret < 0) {
   1413                /*
   1414                 * Safe to treat errors in querying allocation as if
   1415                 * unallocated; we'll probably fail again soon on the
   1416                 * read, but at least that will set a decent errno.
   1417                 */
   1418                pnum = MIN(cluster_bytes, max_transfer);
   1419            }
   1420
   1421            /* Stop at EOF if the image ends in the middle of the cluster */
   1422            if (ret == 0 && pnum == 0) {
   1423                assert(progress >= bytes);
   1424                break;
   1425            }
   1426
   1427            assert(skip_bytes < pnum);
   1428        }
   1429
   1430        if (ret <= 0) {
   1431            QEMUIOVector local_qiov;
   1432
   1433            /* Must copy-on-read; use the bounce buffer */
   1434            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
   1435            if (!bounce_buffer) {
   1436                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
   1437                int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
   1438                int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
   1439
   1440                bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
   1441                if (!bounce_buffer) {
   1442                    ret = -ENOMEM;
   1443                    goto err;
   1444                }
   1445            }
   1446            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
   1447
   1448            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
   1449                                     &local_qiov, 0, 0);
   1450            if (ret < 0) {
   1451                goto err;
   1452            }
   1453
   1454            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
   1455            if (drv->bdrv_co_pwrite_zeroes &&
   1456                buffer_is_zero(bounce_buffer, pnum)) {
   1457                /* FIXME: Should we (perhaps conditionally) be setting
   1458                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
   1459                 * that still correctly reads as zero? */
   1460                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
   1461                                               BDRV_REQ_WRITE_UNCHANGED);
   1462            } else {
   1463                /* This does not change the data on the disk, it is not
   1464                 * necessary to flush even in cache=writethrough mode.
   1465                 */
   1466                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
   1467                                          &local_qiov, 0,
   1468                                          BDRV_REQ_WRITE_UNCHANGED);
   1469            }
   1470
   1471            if (ret < 0) {
   1472                /* It might be okay to ignore write errors for guest
   1473                 * requests.  If this is a deliberate copy-on-read
   1474                 * then we don't want to ignore the error.  Simply
   1475                 * report it in all cases.
   1476                 */
   1477                goto err;
   1478            }
   1479
   1480            if (!(flags & BDRV_REQ_PREFETCH)) {
   1481                qemu_iovec_from_buf(qiov, qiov_offset + progress,
   1482                                    bounce_buffer + skip_bytes,
   1483                                    MIN(pnum - skip_bytes, bytes - progress));
   1484            }
   1485        } else if (!(flags & BDRV_REQ_PREFETCH)) {
   1486            /* Read directly into the destination */
   1487            ret = bdrv_driver_preadv(bs, offset + progress,
   1488                                     MIN(pnum - skip_bytes, bytes - progress),
   1489                                     qiov, qiov_offset + progress, 0);
   1490            if (ret < 0) {
   1491                goto err;
   1492            }
   1493        }
   1494
   1495        cluster_offset += pnum;
   1496        cluster_bytes -= pnum;
   1497        progress += pnum - skip_bytes;
   1498        skip_bytes = 0;
   1499    }
   1500    ret = 0;
   1501
   1502err:
   1503    qemu_vfree(bounce_buffer);
   1504    return ret;
   1505}
   1506
   1507/*
   1508 * Forwards an already correctly aligned request to the BlockDriver. This
   1509 * handles copy on read, zeroing after EOF, and fragmentation of large
   1510 * reads; any other features must be implemented by the caller.
   1511 */
   1512static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
   1513    BdrvTrackedRequest *req, int64_t offset, int64_t bytes,
   1514    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
   1515{
   1516    BlockDriverState *bs = child->bs;
   1517    int64_t total_bytes, max_bytes;
   1518    int ret = 0;
   1519    int64_t bytes_remaining = bytes;
   1520    int max_transfer;
   1521
   1522    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   1523    assert(is_power_of_2(align));
   1524    assert((offset & (align - 1)) == 0);
   1525    assert((bytes & (align - 1)) == 0);
   1526    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
   1527    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
   1528                                   align);
   1529
   1530    /* TODO: We would need a per-BDS .supported_read_flags and
   1531     * potential fallback support, if we ever implement any read flags
   1532     * to pass through to drivers.  For now, there aren't any
   1533     * passthrough flags.  */
   1534    assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));
   1535
   1536    /* Handle Copy on Read and associated serialisation */
   1537    if (flags & BDRV_REQ_COPY_ON_READ) {
   1538        /* If we touch the same cluster it counts as an overlap.  This
   1539         * guarantees that allocating writes will be serialized and not race
   1540         * with each other for the same cluster.  For example, in copy-on-read
   1541         * it ensures that the CoR read and write operations are atomic and
   1542         * guest writes cannot interleave between them. */
   1543        bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs));
   1544    } else {
   1545        bdrv_wait_serialising_requests(req);
   1546    }
   1547
   1548    if (flags & BDRV_REQ_COPY_ON_READ) {
   1549        int64_t pnum;
   1550
   1551        /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */
   1552        flags &= ~BDRV_REQ_COPY_ON_READ;
   1553
   1554        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
   1555        if (ret < 0) {
   1556            goto out;
   1557        }
   1558
   1559        if (!ret || pnum != bytes) {
   1560            ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
   1561                                           qiov, qiov_offset, flags);
   1562            goto out;
   1563        } else if (flags & BDRV_REQ_PREFETCH) {
   1564            goto out;
   1565        }
   1566    }
   1567
   1568    /* Forward the request to the BlockDriver, possibly fragmenting it */
   1569    total_bytes = bdrv_getlength(bs);
   1570    if (total_bytes < 0) {
   1571        ret = total_bytes;
   1572        goto out;
   1573    }
   1574
   1575    assert(!(flags & ~bs->supported_read_flags));
   1576
   1577    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
   1578    if (bytes <= max_bytes && bytes <= max_transfer) {
   1579        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags);
   1580        goto out;
   1581    }
   1582
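            /*
             * Fragment the request into pieces of at most max_transfer bytes.
             * Once max_bytes (the bytes available before end of file) is
             * exhausted, the rest of the qiov is simply zero-filled, so reads
             * beyond EOF return zeroes.
             */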
   1583    while (bytes_remaining) {
   1584        int64_t num;
   1585
   1586        if (max_bytes) {
   1587            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
   1588            assert(num);
   1589
   1590            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
   1591                                     num, qiov,
   1592                                     qiov_offset + bytes - bytes_remaining,
   1593                                     flags);
   1594            max_bytes -= num;
   1595        } else {
   1596            num = bytes_remaining;
   1597            ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining,
   1598                                    0, bytes_remaining);
   1599        }
   1600        if (ret < 0) {
   1601            goto out;
   1602        }
   1603        bytes_remaining -= num;
   1604    }
   1605
   1606out:
   1607    return ret < 0 ? ret : 0;
   1608}
   1609
   1610/*
   1611 * Request padding
   1612 *
   1613 *  |<---- align ----->|                     |<----- align ---->|
   1614 *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
   1615 *  |          |       |                     |     |            |
   1616 * -*----------$-------*-------- ... --------*-----$------------*---
   1617 *  |          |       |                     |     |            |
   1618 *  |          offset  |                     |     end          |
   1619 *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
   1620 *  [buf   ... )                             [tail_buf          )
   1621 *
   1622 * @buf is an aligned allocation needed to store @head and @tail paddings. @head
    1623 * is placed at the beginning of @buf and @tail at the end.
    1624 *
    1625 * @tail_buf is a pointer to the sub-buffer corresponding to the align-sized
    1626 * chunk around the tail, if a tail exists.
    1627 *
    1628 * @merge_reads is true for small requests, i.e. when
    1629 * @buf_len == @head + bytes + @tail. In this case it is possible that both
   1630 * head and tail exist but @buf_len == align and @tail_buf == @buf.
   1631 */
   1632typedef struct BdrvRequestPadding {
   1633    uint8_t *buf;
   1634    size_t buf_len;
   1635    uint8_t *tail_buf;
   1636    size_t head;
   1637    size_t tail;
   1638    bool merge_reads;
   1639    QEMUIOVector local_qiov;
   1640} BdrvRequestPadding;
   1641
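        /*
         * Worked example (illustrative values only, assuming align == 4096) of
         * what bdrv_init_padding() below computes:
         *
         *   offset == 5000, bytes == 3000:
         *     head == 904, tail == 192, sum == 4096 == align, so
         *     buf_len == align, merge_reads == true and tail_buf == buf.
         *
         *   offset == 5000, bytes == 10000:
         *     head == 904, tail == 1384, sum == 12288 > align, so
         *     buf_len == 2 * align, merge_reads == false and
         *     tail_buf == buf + align.
         */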
   1642static bool bdrv_init_padding(BlockDriverState *bs,
   1643                              int64_t offset, int64_t bytes,
   1644                              BdrvRequestPadding *pad)
   1645{
   1646    int64_t align = bs->bl.request_alignment;
   1647    int64_t sum;
   1648
   1649    bdrv_check_request(offset, bytes, &error_abort);
   1650    assert(align <= INT_MAX); /* documented in block/block_int.h */
   1651    assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */
   1652
   1653    memset(pad, 0, sizeof(*pad));
   1654
   1655    pad->head = offset & (align - 1);
   1656    pad->tail = ((offset + bytes) & (align - 1));
   1657    if (pad->tail) {
   1658        pad->tail = align - pad->tail;
   1659    }
   1660
   1661    if (!pad->head && !pad->tail) {
   1662        return false;
   1663    }
   1664
   1665    assert(bytes); /* Nothing good in aligning zero-length requests */
   1666
   1667    sum = pad->head + bytes + pad->tail;
   1668    pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
   1669    pad->buf = qemu_blockalign(bs, pad->buf_len);
   1670    pad->merge_reads = sum == pad->buf_len;
   1671    if (pad->tail) {
   1672        pad->tail_buf = pad->buf + pad->buf_len - align;
   1673    }
   1674
   1675    return true;
   1676}
   1677
   1678static int bdrv_padding_rmw_read(BdrvChild *child,
   1679                                 BdrvTrackedRequest *req,
   1680                                 BdrvRequestPadding *pad,
   1681                                 bool zero_middle)
   1682{
   1683    QEMUIOVector local_qiov;
   1684    BlockDriverState *bs = child->bs;
   1685    uint64_t align = bs->bl.request_alignment;
   1686    int ret;
   1687
   1688    assert(req->serialising && pad->buf);
   1689
   1690    if (pad->head || pad->merge_reads) {
   1691        int64_t bytes = pad->merge_reads ? pad->buf_len : align;
   1692
   1693        qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);
   1694
   1695        if (pad->head) {
   1696            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
   1697        }
   1698        if (pad->merge_reads && pad->tail) {
   1699            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
   1700        }
   1701        ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
   1702                                  align, &local_qiov, 0, 0);
   1703        if (ret < 0) {
   1704            return ret;
   1705        }
   1706        if (pad->head) {
   1707            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
   1708        }
   1709        if (pad->merge_reads && pad->tail) {
   1710            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
   1711        }
   1712
   1713        if (pad->merge_reads) {
   1714            goto zero_mem;
   1715        }
   1716    }
   1717
   1718    if (pad->tail) {
   1719        qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);
   1720
   1721        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
   1722        ret = bdrv_aligned_preadv(
   1723                child, req,
   1724                req->overlap_offset + req->overlap_bytes - align,
   1725                align, align, &local_qiov, 0, 0);
   1726        if (ret < 0) {
   1727            return ret;
   1728        }
   1729        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
   1730    }
   1731
   1732zero_mem:
   1733    if (zero_middle) {
   1734        memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
   1735    }
   1736
   1737    return 0;
   1738}
   1739
   1740static void bdrv_padding_destroy(BdrvRequestPadding *pad)
   1741{
   1742    if (pad->buf) {
   1743        qemu_vfree(pad->buf);
   1744        qemu_iovec_destroy(&pad->local_qiov);
   1745    }
   1746    memset(pad, 0, sizeof(*pad));
   1747}
   1748
   1749/*
   1750 * bdrv_pad_request
   1751 *
    1752 * Exchange request parameters with a padded request if needed. This does not
    1753 * include the RMW read of the padding; bdrv_padding_rmw_read() should be
    1754 * called separately if needed.
   1755 *
    1756 * Request parameters (@qiov, @qiov_offset, @offset, @bytes) are in-out:
    1757 *  - on function start they represent the original request
    1758 *  - on failure or when padding is not needed they are unchanged
    1759 *  - on success when padding is needed they represent the padded request
   1760 */
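        /*
         * Continuing the illustrative numbers from the padding example above
         * (align == 4096, offset == 5000, bytes == 10000, head == 904,
         * tail == 1384): on success *offset becomes 4096, *bytes becomes 12288,
         * *qiov points at pad->local_qiov and *qiov_offset becomes 0.
         */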
   1761static int bdrv_pad_request(BlockDriverState *bs,
   1762                            QEMUIOVector **qiov, size_t *qiov_offset,
   1763                            int64_t *offset, int64_t *bytes,
   1764                            BdrvRequestPadding *pad, bool *padded)
   1765{
   1766    int ret;
   1767
   1768    bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
   1769
   1770    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
   1771        if (padded) {
   1772            *padded = false;
   1773        }
   1774        return 0;
   1775    }
   1776
   1777    ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
   1778                                   *qiov, *qiov_offset, *bytes,
   1779                                   pad->buf + pad->buf_len - pad->tail,
   1780                                   pad->tail);
   1781    if (ret < 0) {
   1782        bdrv_padding_destroy(pad);
   1783        return ret;
   1784    }
   1785    *bytes += pad->head + pad->tail;
   1786    *offset -= pad->head;
   1787    *qiov = &pad->local_qiov;
   1788    *qiov_offset = 0;
   1789    if (padded) {
   1790        *padded = true;
   1791    }
   1792
   1793    return 0;
   1794}
   1795
   1796int coroutine_fn bdrv_co_preadv(BdrvChild *child,
   1797    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
   1798    BdrvRequestFlags flags)
   1799{
   1800    return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
   1801}
   1802
   1803int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
   1804    int64_t offset, int64_t bytes,
   1805    QEMUIOVector *qiov, size_t qiov_offset,
   1806    BdrvRequestFlags flags)
   1807{
   1808    BlockDriverState *bs = child->bs;
   1809    BdrvTrackedRequest req;
   1810    BdrvRequestPadding pad;
   1811    int ret;
   1812
   1813    trace_bdrv_co_preadv_part(bs, offset, bytes, flags);
   1814
   1815    if (!bdrv_is_inserted(bs)) {
   1816        return -ENOMEDIUM;
   1817    }
   1818
   1819    ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
   1820    if (ret < 0) {
   1821        return ret;
   1822    }
   1823
   1824    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
   1825        /*
    1826         * Aligning a zero-length request is nonsense. Even if the driver gives
    1827         * special meaning to zero length (like qcow2_co_pwritev_compressed_part),
    1828         * we can't pass such a request to the driver due to request_alignment.
    1829         *
    1830         * Still, there is no reason to return an error if someone does an
    1831         * unaligned zero-length read occasionally.
   1832         */
   1833        return 0;
   1834    }
   1835
   1836    bdrv_inc_in_flight(bs);
   1837
   1838    /* Don't do copy-on-read if we read data before write operation */
   1839    if (qatomic_read(&bs->copy_on_read)) {
   1840        flags |= BDRV_REQ_COPY_ON_READ;
   1841    }
   1842
   1843    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
   1844                           NULL);
   1845    if (ret < 0) {
   1846        goto fail;
   1847    }
   1848
   1849    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
   1850    ret = bdrv_aligned_preadv(child, &req, offset, bytes,
   1851                              bs->bl.request_alignment,
   1852                              qiov, qiov_offset, flags);
   1853    tracked_request_end(&req);
   1854    bdrv_padding_destroy(&pad);
   1855
   1856fail:
   1857    bdrv_dec_in_flight(bs);
   1858
   1859    return ret;
   1860}
   1861
   1862static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
   1863    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
   1864{
   1865    BlockDriver *drv = bs->drv;
   1866    QEMUIOVector qiov;
   1867    void *buf = NULL;
   1868    int ret = 0;
   1869    bool need_flush = false;
   1870    int head = 0;
   1871    int tail = 0;
   1872
   1873    int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes,
   1874                                            INT64_MAX);
   1875    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
   1876                        bs->bl.request_alignment);
   1877    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
   1878
   1879    bdrv_check_request(offset, bytes, &error_abort);
   1880
   1881    if (!drv) {
   1882        return -ENOMEDIUM;
   1883    }
   1884
   1885    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
   1886        return -ENOTSUP;
   1887    }
   1888
   1889    /* Invalidate the cached block-status data range if this write overlaps */
   1890    bdrv_bsc_invalidate_range(bs, offset, bytes);
   1891
   1892    assert(alignment % bs->bl.request_alignment == 0);
   1893    head = offset % alignment;
   1894    tail = (offset + bytes) % alignment;
   1895    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
   1896    assert(max_write_zeroes >= bs->bl.request_alignment);
   1897
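            /*
             * Illustrative fragmentation (example values, assuming
             * alignment == 4096 and large max_transfer / max_write_zeroes):
             * offset == 5000, bytes == 20000 gives head == 904 and tail == 424,
             * and the loop below issues three pieces:
             *   [5000, 8192)   - unaligned head piece
             *   [8192, 24576)  - aligned middle
             *   [24576, 25000) - unaligned tail piece
             */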
   1898    while (bytes > 0 && !ret) {
   1899        int64_t num = bytes;
   1900
   1901        /* Align request.  Block drivers can expect the "bulk" of the request
   1902         * to be aligned, and that unaligned requests do not cross cluster
   1903         * boundaries.
   1904         */
   1905        if (head) {
   1906            /* Make a small request up to the first aligned sector. For
   1907             * convenience, limit this request to max_transfer even if
   1908             * we don't need to fall back to writes.  */
   1909            num = MIN(MIN(bytes, max_transfer), alignment - head);
   1910            head = (head + num) % alignment;
   1911            assert(num < max_write_zeroes);
   1912        } else if (tail && num > alignment) {
   1913            /* Shorten the request to the last aligned sector.  */
   1914            num -= tail;
   1915        }
   1916
   1917        /* limit request size */
   1918        if (num > max_write_zeroes) {
   1919            num = max_write_zeroes;
   1920        }
   1921
   1922        ret = -ENOTSUP;
   1923        /* First try the efficient write zeroes operation */
   1924        if (drv->bdrv_co_pwrite_zeroes) {
   1925            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
   1926                                             flags & bs->supported_zero_flags);
   1927            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
   1928                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
   1929                need_flush = true;
   1930            }
   1931        } else {
   1932            assert(!bs->supported_zero_flags);
   1933        }
   1934
   1935        if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
   1936            /* Fall back to bounce buffer if write zeroes is unsupported */
   1937            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
   1938
   1939            if ((flags & BDRV_REQ_FUA) &&
   1940                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
    1941                /* No need for bdrv_driver_pwritev() to do a fallback
   1942                 * flush on each chunk; use just one at the end */
   1943                write_flags &= ~BDRV_REQ_FUA;
   1944                need_flush = true;
   1945            }
   1946            num = MIN(num, max_transfer);
   1947            if (buf == NULL) {
   1948                buf = qemu_try_blockalign0(bs, num);
   1949                if (buf == NULL) {
   1950                    ret = -ENOMEM;
   1951                    goto fail;
   1952                }
   1953            }
   1954            qemu_iovec_init_buf(&qiov, buf, num);
   1955
   1956            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);
   1957
    1958            /* Keep bounce buffer around if it is big enough for
    1959             * all future requests.
   1960             */
   1961            if (num < max_transfer) {
   1962                qemu_vfree(buf);
   1963                buf = NULL;
   1964            }
   1965        }
   1966
   1967        offset += num;
   1968        bytes -= num;
   1969    }
   1970
   1971fail:
   1972    if (ret == 0 && need_flush) {
   1973        ret = bdrv_co_flush(bs);
   1974    }
   1975    qemu_vfree(buf);
   1976    return ret;
   1977}
   1978
   1979static inline int coroutine_fn
   1980bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
   1981                          BdrvTrackedRequest *req, int flags)
   1982{
   1983    BlockDriverState *bs = child->bs;
   1984
   1985    bdrv_check_request(offset, bytes, &error_abort);
   1986
   1987    if (bdrv_is_read_only(bs)) {
   1988        return -EPERM;
   1989    }
   1990
   1991    assert(!(bs->open_flags & BDRV_O_INACTIVE));
   1992    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
   1993    assert(!(flags & ~BDRV_REQ_MASK));
   1994    assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING)));
   1995
   1996    if (flags & BDRV_REQ_SERIALISING) {
   1997        QEMU_LOCK_GUARD(&bs->reqs_lock);
   1998
   1999        tracked_request_set_serialising(req, bdrv_get_cluster_size(bs));
   2000
   2001        if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) {
   2002            return -EBUSY;
   2003        }
   2004
   2005        bdrv_wait_serialising_requests_locked(req);
   2006    } else {
   2007        bdrv_wait_serialising_requests(req);
   2008    }
   2009
   2010    assert(req->overlap_offset <= offset);
   2011    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
   2012    assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE ||
   2013           child->perm & BLK_PERM_RESIZE);
   2014
   2015    switch (req->type) {
   2016    case BDRV_TRACKED_WRITE:
   2017    case BDRV_TRACKED_DISCARD:
   2018        if (flags & BDRV_REQ_WRITE_UNCHANGED) {
   2019            assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
   2020        } else {
   2021            assert(child->perm & BLK_PERM_WRITE);
   2022        }
   2023        bdrv_write_threshold_check_write(bs, offset, bytes);
   2024        return 0;
   2025    case BDRV_TRACKED_TRUNCATE:
   2026        assert(child->perm & BLK_PERM_RESIZE);
   2027        return 0;
   2028    default:
   2029        abort();
   2030    }
   2031}
   2032
   2033static inline void coroutine_fn
   2034bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes,
   2035                         BdrvTrackedRequest *req, int ret)
   2036{
   2037    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
   2038    BlockDriverState *bs = child->bs;
   2039
   2040    bdrv_check_request(offset, bytes, &error_abort);
   2041
   2042    qatomic_inc(&bs->write_gen);
   2043
   2044    /*
   2045     * Discard cannot extend the image, but in error handling cases, such as
    2046     * when reverting a qcow2 cluster allocation, the discarded range can extend
    2047     * past the end of the image file, so we cannot assert about
    2048     * BDRV_TRACKED_DISCARD here. Instead, just skip it, since semantically a
    2049     * discard request beyond EOF cannot expand the image anyway.
   2050     */
   2051    if (ret == 0 &&
   2052        (req->type == BDRV_TRACKED_TRUNCATE ||
   2053         end_sector > bs->total_sectors) &&
   2054        req->type != BDRV_TRACKED_DISCARD) {
   2055        bs->total_sectors = end_sector;
   2056        bdrv_parent_cb_resize(bs);
   2057        bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
   2058    }
   2059    if (req->bytes) {
   2060        switch (req->type) {
   2061        case BDRV_TRACKED_WRITE:
   2062            stat64_max(&bs->wr_highest_offset, offset + bytes);
   2063            /* fall through, to set dirty bits */
   2064        case BDRV_TRACKED_DISCARD:
   2065            bdrv_set_dirty(bs, offset, bytes);
   2066            break;
   2067        default:
   2068            break;
   2069        }
   2070    }
   2071}
   2072
   2073/*
   2074 * Forwards an already correctly aligned write request to the BlockDriver,
   2075 * after possibly fragmenting it.
   2076 */
   2077static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
   2078    BdrvTrackedRequest *req, int64_t offset, int64_t bytes,
   2079    int64_t align, QEMUIOVector *qiov, size_t qiov_offset,
   2080    BdrvRequestFlags flags)
   2081{
   2082    BlockDriverState *bs = child->bs;
   2083    BlockDriver *drv = bs->drv;
   2084    int ret;
   2085
   2086    int64_t bytes_remaining = bytes;
   2087    int max_transfer;
   2088
   2089    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   2090
   2091    if (!drv) {
   2092        return -ENOMEDIUM;
   2093    }
   2094
   2095    if (bdrv_has_readonly_bitmaps(bs)) {
   2096        return -EPERM;
   2097    }
   2098
   2099    assert(is_power_of_2(align));
   2100    assert((offset & (align - 1)) == 0);
   2101    assert((bytes & (align - 1)) == 0);
   2102    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
   2103                                   align);
   2104
   2105    ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
   2106
   2107    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
   2108        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
   2109        qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
   2110        flags |= BDRV_REQ_ZERO_WRITE;
   2111        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
   2112            flags |= BDRV_REQ_MAY_UNMAP;
   2113        }
   2114    }
   2115
   2116    if (ret < 0) {
   2117        /* Do nothing, write notifier decided to fail this request */
   2118    } else if (flags & BDRV_REQ_ZERO_WRITE) {
   2119        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
   2120        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
   2121    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
   2122        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
   2123                                             qiov, qiov_offset);
   2124    } else if (bytes <= max_transfer) {
   2125        bdrv_debug_event(bs, BLKDBG_PWRITEV);
   2126        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
   2127    } else {
   2128        bdrv_debug_event(bs, BLKDBG_PWRITEV);
   2129        while (bytes_remaining) {
   2130            int num = MIN(bytes_remaining, max_transfer);
   2131            int local_flags = flags;
   2132
   2133            assert(num);
   2134            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
   2135                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
   2136                /* If FUA is going to be emulated by flush, we only
   2137                 * need to flush on the last iteration */
   2138                local_flags &= ~BDRV_REQ_FUA;
   2139            }
   2140
   2141            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
   2142                                      num, qiov,
   2143                                      qiov_offset + bytes - bytes_remaining,
   2144                                      local_flags);
   2145            if (ret < 0) {
   2146                break;
   2147            }
   2148            bytes_remaining -= num;
   2149        }
   2150    }
   2151    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
   2152
   2153    if (ret >= 0) {
   2154        ret = 0;
   2155    }
   2156    bdrv_co_write_req_finish(child, offset, bytes, req, ret);
   2157
   2158    return ret;
   2159}
   2160
   2161static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
   2162                                                int64_t offset,
   2163                                                int64_t bytes,
   2164                                                BdrvRequestFlags flags,
   2165                                                BdrvTrackedRequest *req)
   2166{
   2167    BlockDriverState *bs = child->bs;
   2168    QEMUIOVector local_qiov;
   2169    uint64_t align = bs->bl.request_alignment;
   2170    int ret = 0;
   2171    bool padding;
   2172    BdrvRequestPadding pad;
   2173
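            /*
             * An unaligned zero write is handled in up to three parts below: a
             * padded head written via read-modify-write, the aligned middle
             * written as zeroes, and a padded tail written via
             * read-modify-write.
             */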
   2174    padding = bdrv_init_padding(bs, offset, bytes, &pad);
   2175    if (padding) {
   2176        bdrv_make_request_serialising(req, align);
   2177
   2178        bdrv_padding_rmw_read(child, req, &pad, true);
   2179
   2180        if (pad.head || pad.merge_reads) {
   2181            int64_t aligned_offset = offset & ~(align - 1);
   2182            int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;
   2183
   2184            qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
   2185            ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
   2186                                       align, &local_qiov, 0,
   2187                                       flags & ~BDRV_REQ_ZERO_WRITE);
   2188            if (ret < 0 || pad.merge_reads) {
   2189                /* Error or all work is done */
   2190                goto out;
   2191            }
   2192            offset += write_bytes - pad.head;
   2193            bytes -= write_bytes - pad.head;
   2194        }
   2195    }
   2196
   2197    assert(!bytes || (offset & (align - 1)) == 0);
   2198    if (bytes >= align) {
   2199        /* Write the aligned part in the middle. */
   2200        int64_t aligned_bytes = bytes & ~(align - 1);
   2201        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
   2202                                   NULL, 0, flags);
   2203        if (ret < 0) {
   2204            goto out;
   2205        }
   2206        bytes -= aligned_bytes;
   2207        offset += aligned_bytes;
   2208    }
   2209
   2210    assert(!bytes || (offset & (align - 1)) == 0);
   2211    if (bytes) {
   2212        assert(align == pad.tail + bytes);
   2213
   2214        qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
   2215        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
   2216                                   &local_qiov, 0,
   2217                                   flags & ~BDRV_REQ_ZERO_WRITE);
   2218    }
   2219
   2220out:
   2221    bdrv_padding_destroy(&pad);
   2222
   2223    return ret;
   2224}
   2225
   2226/*
   2227 * Handle a write request in coroutine context
   2228 */
   2229int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
   2230    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
   2231    BdrvRequestFlags flags)
   2232{
   2233    return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
   2234}
   2235
   2236int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
   2237    int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
   2238    BdrvRequestFlags flags)
   2239{
   2240    BlockDriverState *bs = child->bs;
   2241    BdrvTrackedRequest req;
   2242    uint64_t align = bs->bl.request_alignment;
   2243    BdrvRequestPadding pad;
   2244    int ret;
   2245    bool padded = false;
   2246
   2247    trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags);
   2248
   2249    if (!bdrv_is_inserted(bs)) {
   2250        return -ENOMEDIUM;
   2251    }
   2252
   2253    if (flags & BDRV_REQ_ZERO_WRITE) {
   2254        ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
   2255    } else {
   2256        ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
   2257    }
   2258    if (ret < 0) {
   2259        return ret;
   2260    }
   2261
   2262    /* If the request is misaligned then we can't make it efficient */
   2263    if ((flags & BDRV_REQ_NO_FALLBACK) &&
   2264        !QEMU_IS_ALIGNED(offset | bytes, align))
   2265    {
   2266        return -ENOTSUP;
   2267    }
   2268
   2269    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
   2270        /*
    2271         * Aligning a zero-length request is nonsense. Even if the driver gives
    2272         * special meaning to zero length (like qcow2_co_pwritev_compressed_part),
    2273         * we can't pass such a request to the driver due to request_alignment.
    2274         *
    2275         * Still, there is no reason to return an error if someone does an
    2276         * unaligned zero-length write occasionally.
   2277         */
   2278        return 0;
   2279    }
   2280
   2281    if (!(flags & BDRV_REQ_ZERO_WRITE)) {
   2282        /*
   2283         * Pad request for following read-modify-write cycle.
   2284         * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
   2285         * alignment only if there is no ZERO flag.
   2286         */
   2287        ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
   2288                               &padded);
   2289        if (ret < 0) {
   2290            return ret;
   2291        }
   2292    }
   2293
   2294    bdrv_inc_in_flight(bs);
   2295    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
   2296
   2297    if (flags & BDRV_REQ_ZERO_WRITE) {
   2298        assert(!padded);
   2299        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
   2300        goto out;
   2301    }
   2302
   2303    if (padded) {
   2304        /*
   2305         * Request was unaligned to request_alignment and therefore
   2306         * padded.  We are going to do read-modify-write, and must
   2307         * serialize the request to prevent interactions of the
   2308         * widened region with other transactions.
   2309         */
   2310        bdrv_make_request_serialising(&req, align);
   2311        bdrv_padding_rmw_read(child, &req, &pad, false);
   2312    }
   2313
   2314    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
   2315                               qiov, qiov_offset, flags);
   2316
   2317    bdrv_padding_destroy(&pad);
   2318
   2319out:
   2320    tracked_request_end(&req);
   2321    bdrv_dec_in_flight(bs);
   2322
   2323    return ret;
   2324}
   2325
   2326int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
   2327                                       int64_t bytes, BdrvRequestFlags flags)
   2328{
   2329    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
   2330
   2331    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
   2332        flags &= ~BDRV_REQ_MAY_UNMAP;
   2333    }
   2334
   2335    return bdrv_co_pwritev(child, offset, bytes, NULL,
   2336                           BDRV_REQ_ZERO_WRITE | flags);
   2337}
   2338
   2339/*
    2340 * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
   2341 */
   2342int bdrv_flush_all(void)
   2343{
   2344    BdrvNextIterator it;
   2345    BlockDriverState *bs = NULL;
   2346    int result = 0;
   2347
   2348    /*
    2349     * The bdrv queue is managed by record/replay;
    2350     * creating a new flush request when stopping
    2351     * the VM may break determinism.
   2352     */
   2353    if (replay_events_enabled()) {
   2354        return result;
   2355    }
   2356
   2357    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
   2358        AioContext *aio_context = bdrv_get_aio_context(bs);
   2359        int ret;
   2360
   2361        aio_context_acquire(aio_context);
   2362        ret = bdrv_flush(bs);
   2363        if (ret < 0 && !result) {
   2364            result = ret;
   2365        }
   2366        aio_context_release(aio_context);
   2367    }
   2368
   2369    return result;
   2370}
   2371
   2372/*
   2373 * Returns the allocation status of the specified sectors.
    2374 * Drivers not implementing the functionality are assumed not to support
    2375 * backing files; hence all their sectors are reported as allocated.
   2376 *
   2377 * If 'want_zero' is true, the caller is querying for mapping
   2378 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
   2379 * _ZERO where possible; otherwise, the result favors larger 'pnum',
   2380 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
   2381 *
   2382 * If 'offset' is beyond the end of the disk image the return value is
   2383 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
   2384 *
   2385 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
   2386 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
   2387 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
   2388 *
   2389 * 'pnum' is set to the number of bytes (including and immediately
   2390 * following the specified offset) that are easily known to be in the
   2391 * same allocated/unallocated state.  Note that a second call starting
   2392 * at the original offset plus returned pnum may have the same status.
   2393 * The returned value is non-zero on success except at end-of-file.
   2394 *
   2395 * Returns negative errno on failure.  Otherwise, if the
   2396 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
   2397 * set to the host mapping and BDS corresponding to the guest offset.
   2398 */
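        /*
         * Illustrative caller sketch (not taken from an existing caller): a
         * range can be scanned by repeatedly advancing by the returned *pnum,
         * e.g. via the public bdrv_block_status() wrapper defined further
         * below:
         *
         *     int64_t cur = offset, remaining = bytes, pnum;
         *     while (remaining > 0) {
         *         int ret = bdrv_block_status(bs, cur, remaining,
         *                                     &pnum, NULL, NULL);
         *         if (ret < 0 || pnum == 0) {
         *             break;    // error, or end of file reached
         *         }
         *         // ret holds BDRV_BLOCK_* flags valid for [cur, cur + pnum)
         *         cur += pnum;
         *         remaining -= pnum;
         *     }
         */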
   2399static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
   2400                                             bool want_zero,
   2401                                             int64_t offset, int64_t bytes,
   2402                                             int64_t *pnum, int64_t *map,
   2403                                             BlockDriverState **file)
   2404{
   2405    int64_t total_size;
   2406    int64_t n; /* bytes */
   2407    int ret;
   2408    int64_t local_map = 0;
   2409    BlockDriverState *local_file = NULL;
   2410    int64_t aligned_offset, aligned_bytes;
   2411    uint32_t align;
   2412    bool has_filtered_child;
   2413
   2414    assert(pnum);
   2415    *pnum = 0;
   2416    total_size = bdrv_getlength(bs);
   2417    if (total_size < 0) {
   2418        ret = total_size;
   2419        goto early_out;
   2420    }
   2421
   2422    if (offset >= total_size) {
   2423        ret = BDRV_BLOCK_EOF;
   2424        goto early_out;
   2425    }
   2426    if (!bytes) {
   2427        ret = 0;
   2428        goto early_out;
   2429    }
   2430
   2431    n = total_size - offset;
   2432    if (n < bytes) {
   2433        bytes = n;
   2434    }
   2435
   2436    /* Must be non-NULL or bdrv_getlength() would have failed */
   2437    assert(bs->drv);
   2438    has_filtered_child = bdrv_filter_child(bs);
   2439    if (!bs->drv->bdrv_co_block_status && !has_filtered_child) {
   2440        *pnum = bytes;
   2441        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
   2442        if (offset + bytes == total_size) {
   2443            ret |= BDRV_BLOCK_EOF;
   2444        }
   2445        if (bs->drv->protocol_name) {
   2446            ret |= BDRV_BLOCK_OFFSET_VALID;
   2447            local_map = offset;
   2448            local_file = bs;
   2449        }
   2450        goto early_out;
   2451    }
   2452
   2453    bdrv_inc_in_flight(bs);
   2454
   2455    /* Round out to request_alignment boundaries */
   2456    align = bs->bl.request_alignment;
   2457    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
   2458    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
   2459
   2460    if (bs->drv->bdrv_co_block_status) {
   2461        /*
   2462         * Use the block-status cache only for protocol nodes: Format
   2463         * drivers are generally quick to inquire the status, but protocol
   2464         * drivers often need to get information from outside of qemu, so
   2465         * we do not have control over the actual implementation.  There
   2466         * have been cases where inquiring the status took an unreasonably
   2467         * long time, and we can do nothing in qemu to fix it.
   2468         * This is especially problematic for images with large data areas,
   2469         * because finding the few holes in them and giving them special
   2470         * treatment does not gain much performance.  Therefore, we try to
   2471         * cache the last-identified data region.
   2472         *
   2473         * Second, limiting ourselves to protocol nodes allows us to assume
   2474         * the block status for data regions to be DATA | OFFSET_VALID, and
   2475         * that the host offset is the same as the guest offset.
   2476         *
   2477         * Note that it is possible that external writers zero parts of
   2478         * the cached regions without the cache being invalidated, and so
   2479         * we may report zeroes as data.  This is not catastrophic,
   2480         * however, because reporting zeroes as data is fine.
   2481         */
   2482        if (QLIST_EMPTY(&bs->children) &&
   2483            bdrv_bsc_is_data(bs, aligned_offset, pnum))
   2484        {
   2485            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
   2486            local_file = bs;
   2487            local_map = aligned_offset;
   2488        } else {
   2489            ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
   2490                                                aligned_bytes, pnum, &local_map,
   2491                                                &local_file);
   2492
   2493            /*
   2494             * Note that checking QLIST_EMPTY(&bs->children) is also done when
   2495             * the cache is queried above.  Technically, we do not need to check
   2496             * it here; the worst that can happen is that we fill the cache for
   2497             * non-protocol nodes, and then it is never used.  However, filling
   2498             * the cache requires an RCU update, so double check here to avoid
   2499             * such an update if possible.
   2500             */
   2501            if (ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) &&
   2502                QLIST_EMPTY(&bs->children))
   2503            {
   2504                /*
   2505                 * When a protocol driver reports BLOCK_OFFSET_VALID, the
   2506                 * returned local_map value must be the same as the offset we
    2507                 * have passed (aligned_offset), and local_file must be the node
   2508                 * itself.
   2509                 * Assert this, because we follow this rule when reading from
   2510                 * the cache (see the `local_file = bs` and
   2511                 * `local_map = aligned_offset` assignments above), and the
   2512                 * result the cache delivers must be the same as the driver
   2513                 * would deliver.
   2514                 */
   2515                assert(local_file == bs);
   2516                assert(local_map == aligned_offset);
   2517                bdrv_bsc_fill(bs, aligned_offset, *pnum);
   2518            }
   2519        }
   2520    } else {
   2521        /* Default code for filters */
   2522
   2523        local_file = bdrv_filter_bs(bs);
   2524        assert(local_file);
   2525
   2526        *pnum = aligned_bytes;
   2527        local_map = aligned_offset;
   2528        ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
   2529    }
   2530    if (ret < 0) {
   2531        *pnum = 0;
   2532        goto out;
   2533    }
   2534
   2535    /*
   2536     * The driver's result must be a non-zero multiple of request_alignment.
   2537     * Clamp pnum and adjust map to original request.
   2538     */
   2539    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
   2540           align > offset - aligned_offset);
   2541    if (ret & BDRV_BLOCK_RECURSE) {
   2542        assert(ret & BDRV_BLOCK_DATA);
   2543        assert(ret & BDRV_BLOCK_OFFSET_VALID);
   2544        assert(!(ret & BDRV_BLOCK_ZERO));
   2545    }
   2546
   2547    *pnum -= offset - aligned_offset;
   2548    if (*pnum > bytes) {
   2549        *pnum = bytes;
   2550    }
   2551    if (ret & BDRV_BLOCK_OFFSET_VALID) {
   2552        local_map += offset - aligned_offset;
   2553    }
   2554
   2555    if (ret & BDRV_BLOCK_RAW) {
   2556        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
   2557        ret = bdrv_co_block_status(local_file, want_zero, local_map,
   2558                                   *pnum, pnum, &local_map, &local_file);
   2559        goto out;
   2560    }
   2561
   2562    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
   2563        ret |= BDRV_BLOCK_ALLOCATED;
   2564    } else if (bs->drv->supports_backing) {
   2565        BlockDriverState *cow_bs = bdrv_cow_bs(bs);
   2566
   2567        if (!cow_bs) {
   2568            ret |= BDRV_BLOCK_ZERO;
   2569        } else if (want_zero) {
   2570            int64_t size2 = bdrv_getlength(cow_bs);
   2571
   2572            if (size2 >= 0 && offset >= size2) {
   2573                ret |= BDRV_BLOCK_ZERO;
   2574            }
   2575        }
   2576    }
   2577
   2578    if (want_zero && ret & BDRV_BLOCK_RECURSE &&
   2579        local_file && local_file != bs &&
   2580        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
   2581        (ret & BDRV_BLOCK_OFFSET_VALID)) {
   2582        int64_t file_pnum;
   2583        int ret2;
   2584
   2585        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
   2586                                    *pnum, &file_pnum, NULL, NULL);
   2587        if (ret2 >= 0) {
   2588            /* Ignore errors.  This is just providing extra information, it
   2589             * is useful but not necessary.
   2590             */
   2591            if (ret2 & BDRV_BLOCK_EOF &&
   2592                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
   2593                /*
   2594                 * It is valid for the format block driver to read
   2595                 * beyond the end of the underlying file's current
   2596                 * size; such areas read as zero.
   2597                 */
   2598                ret |= BDRV_BLOCK_ZERO;
   2599            } else {
   2600                /* Limit request to the range reported by the protocol driver */
   2601                *pnum = file_pnum;
   2602                ret |= (ret2 & BDRV_BLOCK_ZERO);
   2603            }
   2604        }
   2605    }
   2606
   2607out:
   2608    bdrv_dec_in_flight(bs);
   2609    if (ret >= 0 && offset + *pnum == total_size) {
   2610        ret |= BDRV_BLOCK_EOF;
   2611    }
   2612early_out:
   2613    if (file) {
   2614        *file = local_file;
   2615    }
   2616    if (map) {
   2617        *map = local_map;
   2618    }
   2619    return ret;
   2620}
   2621
   2622int coroutine_fn
   2623bdrv_co_common_block_status_above(BlockDriverState *bs,
   2624                                  BlockDriverState *base,
   2625                                  bool include_base,
   2626                                  bool want_zero,
   2627                                  int64_t offset,
   2628                                  int64_t bytes,
   2629                                  int64_t *pnum,
   2630                                  int64_t *map,
   2631                                  BlockDriverState **file,
   2632                                  int *depth)
   2633{
   2634    int ret;
   2635    BlockDriverState *p;
   2636    int64_t eof = 0;
   2637    int dummy;
   2638
   2639    assert(!include_base || base); /* Can't include NULL base */
   2640
   2641    if (!depth) {
   2642        depth = &dummy;
   2643    }
   2644    *depth = 0;
   2645
   2646    if (!include_base && bs == base) {
   2647        *pnum = bytes;
   2648        return 0;
   2649    }
   2650
   2651    ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file);
   2652    ++*depth;
   2653    if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
   2654        return ret;
   2655    }
   2656
   2657    if (ret & BDRV_BLOCK_EOF) {
   2658        eof = offset + *pnum;
   2659    }
   2660
   2661    assert(*pnum <= bytes);
   2662    bytes = *pnum;
   2663
   2664    for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base;
   2665         p = bdrv_filter_or_cow_bs(p))
   2666    {
   2667        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
   2668                                   file);
   2669        ++*depth;
   2670        if (ret < 0) {
   2671            return ret;
   2672        }
   2673        if (*pnum == 0) {
   2674            /*
   2675             * The top layer deferred to this layer, and because this layer is
   2676             * short, any zeroes that we synthesize beyond EOF behave as if they
   2677             * were allocated at this layer.
   2678             *
    2679             * We don't include BDRV_BLOCK_EOF in ret, as the upper layer may be
   2680             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
   2681             * below.
   2682             */
   2683            assert(ret & BDRV_BLOCK_EOF);
   2684            *pnum = bytes;
   2685            if (file) {
   2686                *file = p;
   2687            }
   2688            ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED;
   2689            break;
   2690        }
   2691        if (ret & BDRV_BLOCK_ALLOCATED) {
   2692            /*
   2693             * We've found the node and the status, we must break.
   2694             *
    2695             * Drop BDRV_BLOCK_EOF, as it's not for the upper layer, which may be
   2696             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
   2697             * below.
   2698             */
   2699            ret &= ~BDRV_BLOCK_EOF;
   2700            break;
   2701        }
   2702
   2703        if (p == base) {
   2704            assert(include_base);
   2705            break;
   2706        }
   2707
   2708        /*
    2709         * OK, the [offset, offset + *pnum) region is unallocated on this layer,
    2710         * so let's continue diving down.
   2711         */
   2712        assert(*pnum <= bytes);
   2713        bytes = *pnum;
   2714    }
   2715
   2716    if (offset + *pnum == eof) {
   2717        ret |= BDRV_BLOCK_EOF;
   2718    }
   2719
   2720    return ret;
   2721}
   2722
   2723int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
   2724                            int64_t offset, int64_t bytes, int64_t *pnum,
   2725                            int64_t *map, BlockDriverState **file)
   2726{
   2727    return bdrv_common_block_status_above(bs, base, false, true, offset, bytes,
   2728                                          pnum, map, file, NULL);
   2729}
   2730
   2731int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
   2732                      int64_t *pnum, int64_t *map, BlockDriverState **file)
   2733{
   2734    return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs),
   2735                                   offset, bytes, pnum, map, file);
   2736}
   2737
   2738/*
   2739 * Check @bs (and its backing chain) to see if the range defined
   2740 * by @offset and @bytes is known to read as zeroes.
    2741 * Return 1 if that is the case, 0 otherwise, and -errno on error.
    2742 * This test is meant to be fast rather than accurate, so returning 0
   2743 * does not guarantee non-zero data.
   2744 */
   2745int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
   2746                                      int64_t bytes)
   2747{
   2748    int ret;
   2749    int64_t pnum = bytes;
   2750
   2751    if (!bytes) {
   2752        return 1;
   2753    }
   2754
   2755    ret = bdrv_common_block_status_above(bs, NULL, false, false, offset,
   2756                                         bytes, &pnum, NULL, NULL, NULL);
   2757
   2758    if (ret < 0) {
   2759        return ret;
   2760    }
   2761
   2762    return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
   2763}
   2764
   2765int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
   2766                                   int64_t bytes, int64_t *pnum)
   2767{
   2768    int ret;
   2769    int64_t dummy;
   2770
   2771    ret = bdrv_common_block_status_above(bs, bs, true, false, offset,
   2772                                         bytes, pnum ? pnum : &dummy, NULL,
   2773                                         NULL, NULL);
   2774    if (ret < 0) {
   2775        return ret;
   2776    }
   2777    return !!(ret & BDRV_BLOCK_ALLOCATED);
   2778}
   2779
   2780/*
   2781 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
   2782 *
   2783 * Return a positive depth if (a prefix of) the given range is allocated
   2784 * in any image between BASE and TOP (BASE is only included if include_base
   2785 * is set).  Depth 1 is TOP, 2 is the first backing layer, and so forth.
   2786 * BASE can be NULL to check if the given offset is allocated in any
   2787 * image of the chain.  Return 0 otherwise, or negative errno on
   2788 * failure.
   2789 *
   2790 * 'pnum' is set to the number of bytes (including and immediately
   2791 * following the specified offset) that are known to be in the same
   2792 * allocated/unallocated state.  Note that a subsequent call starting
   2793 * at 'offset + *pnum' may return the same allocation status (in other
   2794 * words, the result is not necessarily the maximum possible range);
   2795 * but 'pnum' will only be 0 when end of file is reached.
   2796 */
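        /*
         * For example (illustrative): given the chain [BASE] -> [MID] -> [TOP],
         * a call with top == TOP, base == BASE and include_base == false returns
         * 1 if the range is allocated in TOP, 2 if it is allocated only in MID,
         * and 0 if it is allocated in neither (even if BASE itself has data).
         */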
   2797int bdrv_is_allocated_above(BlockDriverState *top,
   2798                            BlockDriverState *base,
   2799                            bool include_base, int64_t offset,
   2800                            int64_t bytes, int64_t *pnum)
   2801{
   2802    int depth;
   2803    int ret = bdrv_common_block_status_above(top, base, include_base, false,
   2804                                             offset, bytes, pnum, NULL, NULL,
   2805                                             &depth);
   2806    if (ret < 0) {
   2807        return ret;
   2808    }
   2809
   2810    if (ret & BDRV_BLOCK_ALLOCATED) {
   2811        return depth;
   2812    }
   2813    return 0;
   2814}
   2815
   2816int coroutine_fn
   2817bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
   2818{
   2819    BlockDriver *drv = bs->drv;
   2820    BlockDriverState *child_bs = bdrv_primary_bs(bs);
   2821    int ret;
   2822
   2823    ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
   2824    if (ret < 0) {
   2825        return ret;
   2826    }
   2827
   2828    if (!drv) {
   2829        return -ENOMEDIUM;
   2830    }
   2831
   2832    bdrv_inc_in_flight(bs);
   2833
   2834    if (drv->bdrv_load_vmstate) {
   2835        ret = drv->bdrv_load_vmstate(bs, qiov, pos);
   2836    } else if (child_bs) {
   2837        ret = bdrv_co_readv_vmstate(child_bs, qiov, pos);
   2838    } else {
   2839        ret = -ENOTSUP;
   2840    }
   2841
   2842    bdrv_dec_in_flight(bs);
   2843
   2844    return ret;
   2845}
   2846
   2847int coroutine_fn
   2848bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
   2849{
   2850    BlockDriver *drv = bs->drv;
   2851    BlockDriverState *child_bs = bdrv_primary_bs(bs);
   2852    int ret;
   2853
   2854    ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
   2855    if (ret < 0) {
   2856        return ret;
   2857    }
   2858
   2859    if (!drv) {
   2860        return -ENOMEDIUM;
   2861    }
   2862
   2863    bdrv_inc_in_flight(bs);
   2864
   2865    if (drv->bdrv_save_vmstate) {
   2866        ret = drv->bdrv_save_vmstate(bs, qiov, pos);
   2867    } else if (child_bs) {
   2868        ret = bdrv_co_writev_vmstate(child_bs, qiov, pos);
   2869    } else {
   2870        ret = -ENOTSUP;
   2871    }
   2872
   2873    bdrv_dec_in_flight(bs);
   2874
   2875    return ret;
   2876}
   2877
   2878int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
   2879                      int64_t pos, int size)
   2880{
   2881    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
   2882    int ret = bdrv_writev_vmstate(bs, &qiov, pos);
   2883
   2884    return ret < 0 ? ret : size;
   2885}
   2886
   2887int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
   2888                      int64_t pos, int size)
   2889{
   2890    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
   2891    int ret = bdrv_readv_vmstate(bs, &qiov, pos);
   2892
   2893    return ret < 0 ? ret : size;
   2894}
   2895
   2896/**************************************************************/
   2897/* async I/Os */
   2898
   2899void bdrv_aio_cancel(BlockAIOCB *acb)
   2900{
   2901    qemu_aio_ref(acb);
   2902    bdrv_aio_cancel_async(acb);
   2903    while (acb->refcnt > 1) {
   2904        if (acb->aiocb_info->get_aio_context) {
   2905            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
   2906        } else if (acb->bs) {
   2907            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
   2908             * assert that we're not using an I/O thread.  Thread-safe
   2909             * code should use bdrv_aio_cancel_async exclusively.
   2910             */
   2911            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
   2912            aio_poll(bdrv_get_aio_context(acb->bs), true);
   2913        } else {
   2914            abort();
   2915        }
   2916    }
   2917    qemu_aio_unref(acb);
   2918}
   2919
   2920/* Async version of aio cancel. The caller is not blocked if the acb implements
    2921 * cancel_async; otherwise we do nothing and let the request complete normally.
   2922 * In either case the completion callback must be called. */
   2923void bdrv_aio_cancel_async(BlockAIOCB *acb)
   2924{
   2925    if (acb->aiocb_info->cancel_async) {
   2926        acb->aiocb_info->cancel_async(acb);
   2927    }
   2928}
   2929
   2930/**************************************************************/
   2931/* Coroutine block device emulation */
   2932
   2933int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
   2934{
   2935    BdrvChild *primary_child = bdrv_primary_child(bs);
   2936    BdrvChild *child;
   2937    int current_gen;
   2938    int ret = 0;
   2939
   2940    bdrv_inc_in_flight(bs);
   2941
   2942    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
   2943        bdrv_is_sg(bs)) {
   2944        goto early_exit;
   2945    }
   2946
   2947    qemu_co_mutex_lock(&bs->reqs_lock);
   2948    current_gen = qatomic_read(&bs->write_gen);
   2949
   2950    /* Wait until any previous flushes are completed */
   2951    while (bs->active_flush_req) {
   2952        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
   2953    }
   2954
   2955    /* Flushes reach this point in nondecreasing current_gen order.  */
   2956    bs->active_flush_req = true;
   2957    qemu_co_mutex_unlock(&bs->reqs_lock);
   2958
   2959    /* Write back all layers by calling one driver function */
   2960    if (bs->drv->bdrv_co_flush) {
   2961        ret = bs->drv->bdrv_co_flush(bs);
   2962        goto out;
   2963    }
   2964
   2965    /* Write back cached data to the OS even with cache=unsafe */
   2966    BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS);
   2967    if (bs->drv->bdrv_co_flush_to_os) {
   2968        ret = bs->drv->bdrv_co_flush_to_os(bs);
   2969        if (ret < 0) {
   2970            goto out;
   2971        }
   2972    }
   2973
   2974    /* But don't actually force it to the disk with cache=unsafe */
   2975    if (bs->open_flags & BDRV_O_NO_FLUSH) {
   2976        goto flush_children;
   2977    }
   2978
   2979    /* Check if we really need to flush anything */
   2980    if (bs->flushed_gen == current_gen) {
   2981        goto flush_children;
   2982    }
   2983
   2984    BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK);
   2985    if (!bs->drv) {
   2986        /* bs->drv->bdrv_co_flush() might have ejected the BDS
   2987         * (even in case of apparent success) */
   2988        ret = -ENOMEDIUM;
   2989        goto out;
   2990    }
   2991    if (bs->drv->bdrv_co_flush_to_disk) {
   2992        ret = bs->drv->bdrv_co_flush_to_disk(bs);
   2993    } else if (bs->drv->bdrv_aio_flush) {
   2994        BlockAIOCB *acb;
   2995        CoroutineIOCompletion co = {
   2996            .coroutine = qemu_coroutine_self(),
   2997        };
   2998
   2999        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
   3000        if (acb == NULL) {
   3001            ret = -EIO;
   3002        } else {
   3003            qemu_coroutine_yield();
   3004            ret = co.ret;
   3005        }
   3006    } else {
    3007        /*
    3008         * Some block drivers always operate in either writethrough or unsafe
    3009         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
    3010         * know how the server works (because the behaviour is hardcoded or
    3011         * depends on server-side configuration), so we can't ensure that
    3012         * everything is safe on disk. Returning an error doesn't work because
    3013         * that would break guests even if the server operates in writethrough
    3014         * mode.
    3015         *
    3016         * Let's hope the user knows what they're doing.
    3017         */
   3018        ret = 0;
   3019    }
   3020
   3021    if (ret < 0) {
   3022        goto out;
   3023    }
   3024
   3025    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
   3026     * in the case of cache=unsafe, so there are no useless flushes.
   3027     */
   3028flush_children:
   3029    ret = 0;
   3030    QLIST_FOREACH(child, &bs->children, next) {
   3031        if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
   3032            int this_child_ret = bdrv_co_flush(child->bs);
   3033            if (!ret) {
   3034                ret = this_child_ret;
   3035            }
   3036        }
   3037    }
   3038
   3039out:
   3040    /* Notify any pending flushes that we have completed */
   3041    if (ret == 0) {
   3042        bs->flushed_gen = current_gen;
   3043    }
   3044
   3045    qemu_co_mutex_lock(&bs->reqs_lock);
   3046    bs->active_flush_req = false;
   3047    /* Return value is ignored - it's ok if wait queue is empty */
   3048    qemu_co_queue_next(&bs->flush_queue);
   3049    qemu_co_mutex_unlock(&bs->reqs_lock);
   3050
   3051early_exit:
   3052    bdrv_dec_in_flight(bs);
   3053    return ret;
   3054}
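
        /*
         * Illustrative sketch (hypothetical helper): flushing a node and its
         * writable children from coroutine context.  bdrv_co_flush() returns 0 on
         * success or a negative errno.
         */
        static int coroutine_fn example_flush_node(BlockDriverState *bs)
        {
            int ret = bdrv_co_flush(bs);

            if (ret < 0) {
                /* propagate; the caller decides how to report the failure */
                return ret;
            }
            return 0;
        }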
   3055
   3056int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
   3057                                  int64_t bytes)
   3058{
   3059    BdrvTrackedRequest req;
   3060    int ret;
   3061    int64_t max_pdiscard;
   3062    int head, tail, align;
   3063    BlockDriverState *bs = child->bs;
   3064
   3065    if (!bs || !bs->drv || !bdrv_is_inserted(bs)) {
   3066        return -ENOMEDIUM;
   3067    }
   3068
   3069    if (bdrv_has_readonly_bitmaps(bs)) {
   3070        return -EPERM;
   3071    }
   3072
   3073    ret = bdrv_check_request(offset, bytes, NULL);
   3074    if (ret < 0) {
   3075        return ret;
   3076    }
   3077
   3078    /* Do nothing if disabled.  */
   3079    if (!(bs->open_flags & BDRV_O_UNMAP)) {
   3080        return 0;
   3081    }
   3082
   3083    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
   3084        return 0;
   3085    }
   3086
    3087    /* Invalidate any cached block-status data that this discard overlaps */
   3088    bdrv_bsc_invalidate_range(bs, offset, bytes);
   3089
   3090    /* Discard is advisory, but some devices track and coalesce
   3091     * unaligned requests, so we must pass everything down rather than
   3092     * round here.  Still, most devices will just silently ignore
   3093     * unaligned requests (by returning -ENOTSUP), so we must fragment
   3094     * the request accordingly.  */
   3095    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
   3096    assert(align % bs->bl.request_alignment == 0);
   3097    head = offset % align;
   3098    tail = (offset + bytes) % align;
   3099
   3100    bdrv_inc_in_flight(bs);
   3101    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
   3102
   3103    ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
   3104    if (ret < 0) {
   3105        goto out;
   3106    }
   3107
   3108    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT64_MAX),
   3109                                   align);
   3110    assert(max_pdiscard >= bs->bl.request_alignment);
   3111
   3112    while (bytes > 0) {
   3113        int64_t num = bytes;
   3114
   3115        if (head) {
   3116            /* Make small requests to get to alignment boundaries. */
   3117            num = MIN(bytes, align - head);
   3118            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
   3119                num %= bs->bl.request_alignment;
   3120            }
   3121            head = (head + num) % align;
   3122            assert(num < max_pdiscard);
   3123        } else if (tail) {
   3124            if (num > align) {
   3125                /* Shorten the request to the last aligned cluster.  */
   3126                num -= tail;
   3127            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
   3128                       tail > bs->bl.request_alignment) {
   3129                tail %= bs->bl.request_alignment;
   3130                num -= tail;
   3131            }
   3132        }
   3133        /* limit request size */
   3134        if (num > max_pdiscard) {
   3135            num = max_pdiscard;
   3136        }
   3137
   3138        if (!bs->drv) {
   3139            ret = -ENOMEDIUM;
   3140            goto out;
   3141        }
   3142        if (bs->drv->bdrv_co_pdiscard) {
   3143            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
   3144        } else {
   3145            BlockAIOCB *acb;
   3146            CoroutineIOCompletion co = {
   3147                .coroutine = qemu_coroutine_self(),
   3148            };
   3149
   3150            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
   3151                                             bdrv_co_io_em_complete, &co);
   3152            if (acb == NULL) {
   3153                ret = -EIO;
   3154                goto out;
   3155            } else {
   3156                qemu_coroutine_yield();
   3157                ret = co.ret;
   3158            }
   3159        }
   3160        if (ret && ret != -ENOTSUP) {
   3161            goto out;
   3162        }
   3163
   3164        offset += num;
   3165        bytes -= num;
   3166    }
   3167    ret = 0;
   3168out:
   3169    bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
   3170    tracked_request_end(&req);
   3171    bdrv_dec_in_flight(bs);
   3172    return ret;
   3173}
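
        /*
         * Worked example for the alignment handling above (hypothetical numbers):
         * with request_alignment = 512, pdiscard_alignment = 64 KiB and a discard
         * of offset = 4 KiB, bytes = 192 KiB, align is 64 KiB, head is 4 KiB and
         * tail is (4 KiB + 192 KiB) % 64 KiB = 4 KiB.  The loop first issues a
         * 60 KiB request up to the 64 KiB boundary, then the aligned middle
         * (128 KiB, split further only if it exceeded max_pdiscard), and finally
         * the 4 KiB tail.
         */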
   3174
   3175int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
   3176{
   3177    BlockDriver *drv = bs->drv;
   3178    CoroutineIOCompletion co = {
   3179        .coroutine = qemu_coroutine_self(),
   3180    };
   3181    BlockAIOCB *acb;
   3182
   3183    bdrv_inc_in_flight(bs);
   3184    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
   3185        co.ret = -ENOTSUP;
   3186        goto out;
   3187    }
   3188
   3189    if (drv->bdrv_co_ioctl) {
   3190        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
   3191    } else {
   3192        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
   3193        if (!acb) {
   3194            co.ret = -ENOTSUP;
   3195            goto out;
   3196        }
   3197        qemu_coroutine_yield();
   3198    }
   3199out:
   3200    bdrv_dec_in_flight(bs);
   3201    return co.ret;
   3202}
   3203
   3204void *qemu_blockalign(BlockDriverState *bs, size_t size)
   3205{
   3206    return qemu_memalign(bdrv_opt_mem_align(bs), size);
   3207}
   3208
   3209void *qemu_blockalign0(BlockDriverState *bs, size_t size)
   3210{
   3211    return memset(qemu_blockalign(bs, size), 0, size);
   3212}
   3213
   3214void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
   3215{
   3216    size_t align = bdrv_opt_mem_align(bs);
   3217
   3218    /* Ensure that NULL is never returned on success */
   3219    assert(align > 0);
   3220    if (size == 0) {
   3221        size = align;
   3222    }
   3223
   3224    return qemu_try_memalign(align, size);
   3225}
   3226
   3227void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
   3228{
   3229    void *mem = qemu_try_blockalign(bs, size);
   3230
   3231    if (mem) {
   3232        memset(mem, 0, size);
   3233    }
   3234
   3235    return mem;
   3236}
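
        /*
         * Illustrative sketch (hypothetical helper): allocating a zeroed bounce
         * buffer that satisfies the node's memory alignment.  The _try_ variants
         * return NULL instead of aborting when the allocation fails; memory from
         * these helpers is released with qemu_vfree().
         */
        static void *example_alloc_bounce_buffer(BlockDriverState *bs, size_t size)
        {
            void *buf = qemu_try_blockalign0(bs, size);

            if (!buf) {
                return NULL;    /* the caller typically turns this into -ENOMEM */
            }
            return buf;         /* release with qemu_vfree(buf) when done */
        }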
   3237
   3238/*
   3239 * Check if all memory in this vector is sector aligned.
   3240 */
   3241bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
   3242{
   3243    int i;
   3244    size_t alignment = bdrv_min_mem_align(bs);
   3245
   3246    for (i = 0; i < qiov->niov; i++) {
   3247        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
   3248            return false;
   3249        }
   3250        if (qiov->iov[i].iov_len % alignment) {
   3251            return false;
   3252        }
   3253    }
   3254
   3255    return true;
   3256}
   3257
   3258void bdrv_io_plug(BlockDriverState *bs)
   3259{
   3260    BdrvChild *child;
   3261
   3262    QLIST_FOREACH(child, &bs->children, next) {
   3263        bdrv_io_plug(child->bs);
   3264    }
   3265
   3266    if (qatomic_fetch_inc(&bs->io_plugged) == 0) {
   3267        BlockDriver *drv = bs->drv;
   3268        if (drv && drv->bdrv_io_plug) {
   3269            drv->bdrv_io_plug(bs);
   3270        }
   3271    }
   3272}
   3273
   3274void bdrv_io_unplug(BlockDriverState *bs)
   3275{
   3276    BdrvChild *child;
   3277
   3278    assert(bs->io_plugged);
   3279    if (qatomic_fetch_dec(&bs->io_plugged) == 1) {
   3280        BlockDriver *drv = bs->drv;
   3281        if (drv && drv->bdrv_io_unplug) {
   3282            drv->bdrv_io_unplug(bs);
   3283        }
   3284    }
   3285
   3286    QLIST_FOREACH(child, &bs->children, next) {
   3287        bdrv_io_unplug(child->bs);
   3288    }
   3289}
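
        /*
         * Illustrative sketch (hypothetical helper): batching several submissions
         * between plug and unplug so that drivers which implement the hooks can
         * coalesce them into fewer host kernel calls.  Plug and unplug calls must
         * be balanced.
         */
        static void example_submit_batched(BlockDriverState *bs)
        {
            bdrv_io_plug(bs);
            /* ... queue several asynchronous requests against bs here ... */
            bdrv_io_unplug(bs);
        }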
   3290
   3291void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
   3292{
   3293    BdrvChild *child;
   3294
   3295    if (bs->drv && bs->drv->bdrv_register_buf) {
   3296        bs->drv->bdrv_register_buf(bs, host, size);
   3297    }
   3298    QLIST_FOREACH(child, &bs->children, next) {
   3299        bdrv_register_buf(child->bs, host, size);
   3300    }
   3301}
   3302
   3303void bdrv_unregister_buf(BlockDriverState *bs, void *host)
   3304{
   3305    BdrvChild *child;
   3306
   3307    if (bs->drv && bs->drv->bdrv_unregister_buf) {
   3308        bs->drv->bdrv_unregister_buf(bs, host);
   3309    }
   3310    QLIST_FOREACH(child, &bs->children, next) {
   3311        bdrv_unregister_buf(child->bs, host);
   3312    }
   3313}
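
        /*
         * Illustrative sketch (hypothetical helper): pre-registering a long-lived
         * I/O buffer with a node and all of its children so that drivers which
         * implement the hook (for example block/nvme.c) can map or pin it once
         * instead of once per request.
         */
        static void example_use_registered_buf(BlockDriverState *bs,
                                               void *host, size_t size)
        {
            bdrv_register_buf(bs, host, size);
            /* ... issue many requests that use memory inside [host, host+size) ... */
            bdrv_unregister_buf(bs, host);
        }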
   3314
   3315static int coroutine_fn bdrv_co_copy_range_internal(
   3316        BdrvChild *src, int64_t src_offset, BdrvChild *dst,
   3317        int64_t dst_offset, int64_t bytes,
   3318        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
   3319        bool recurse_src)
   3320{
   3321    BdrvTrackedRequest req;
   3322    int ret;
   3323
   3324    /* TODO We can support BDRV_REQ_NO_FALLBACK here */
   3325    assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
   3326    assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
   3327
   3328    if (!dst || !dst->bs || !bdrv_is_inserted(dst->bs)) {
   3329        return -ENOMEDIUM;
   3330    }
   3331    ret = bdrv_check_request32(dst_offset, bytes, NULL, 0);
   3332    if (ret) {
   3333        return ret;
   3334    }
   3335    if (write_flags & BDRV_REQ_ZERO_WRITE) {
   3336        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
   3337    }
   3338
   3339    if (!src || !src->bs || !bdrv_is_inserted(src->bs)) {
   3340        return -ENOMEDIUM;
   3341    }
   3342    ret = bdrv_check_request32(src_offset, bytes, NULL, 0);
   3343    if (ret) {
   3344        return ret;
   3345    }
   3346
   3347    if (!src->bs->drv->bdrv_co_copy_range_from
   3348        || !dst->bs->drv->bdrv_co_copy_range_to
   3349        || src->bs->encrypted || dst->bs->encrypted) {
   3350        return -ENOTSUP;
   3351    }
   3352
   3353    if (recurse_src) {
   3354        bdrv_inc_in_flight(src->bs);
   3355        tracked_request_begin(&req, src->bs, src_offset, bytes,
   3356                              BDRV_TRACKED_READ);
   3357
    3358        /* BDRV_REQ_SERIALISING is only for write operations */
   3359        assert(!(read_flags & BDRV_REQ_SERIALISING));
   3360        bdrv_wait_serialising_requests(&req);
   3361
   3362        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
   3363                                                    src, src_offset,
   3364                                                    dst, dst_offset,
   3365                                                    bytes,
   3366                                                    read_flags, write_flags);
   3367
   3368        tracked_request_end(&req);
   3369        bdrv_dec_in_flight(src->bs);
   3370    } else {
   3371        bdrv_inc_in_flight(dst->bs);
   3372        tracked_request_begin(&req, dst->bs, dst_offset, bytes,
   3373                              BDRV_TRACKED_WRITE);
   3374        ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
   3375                                        write_flags);
   3376        if (!ret) {
   3377            ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
   3378                                                      src, src_offset,
   3379                                                      dst, dst_offset,
   3380                                                      bytes,
   3381                                                      read_flags, write_flags);
   3382        }
   3383        bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
   3384        tracked_request_end(&req);
   3385        bdrv_dec_in_flight(dst->bs);
   3386    }
   3387
   3388    return ret;
   3389}
   3390
   3391/* Copy range from @src to @dst.
   3392 *
   3393 * See the comment of bdrv_co_copy_range for the parameter and return value
   3394 * semantics. */
   3395int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset,
   3396                                         BdrvChild *dst, int64_t dst_offset,
   3397                                         int64_t bytes,
   3398                                         BdrvRequestFlags read_flags,
   3399                                         BdrvRequestFlags write_flags)
   3400{
   3401    trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
   3402                                  read_flags, write_flags);
   3403    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
   3404                                       bytes, read_flags, write_flags, true);
   3405}
   3406
   3407/* Copy range from @src to @dst.
   3408 *
   3409 * See the comment of bdrv_co_copy_range for the parameter and return value
   3410 * semantics. */
   3411int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
   3412                                       BdrvChild *dst, int64_t dst_offset,
   3413                                       int64_t bytes,
   3414                                       BdrvRequestFlags read_flags,
   3415                                       BdrvRequestFlags write_flags)
   3416{
   3417    trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
   3418                                read_flags, write_flags);
   3419    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
   3420                                       bytes, read_flags, write_flags, false);
   3421}
   3422
   3423int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
   3424                                    BdrvChild *dst, int64_t dst_offset,
   3425                                    int64_t bytes, BdrvRequestFlags read_flags,
   3426                                    BdrvRequestFlags write_flags)
   3427{
   3428    return bdrv_co_copy_range_from(src, src_offset,
   3429                                   dst, dst_offset,
   3430                                   bytes, read_flags, write_flags);
   3431}
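
        /*
         * Illustrative sketch (hypothetical helper): offloaded copy between two
         * children from coroutine context.  Per the checks above this fails with
         * -ENOTSUP when either driver lacks copy-range support or encryption is in
         * use, in which case the caller is expected to fall back to ordinary
         * read/write requests.
         */
        static int coroutine_fn example_copy(BdrvChild *src, BdrvChild *dst,
                                             int64_t offset, int64_t bytes)
        {
            return bdrv_co_copy_range(src, offset, dst, offset, bytes, 0, 0);
        }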
   3432
   3433static void bdrv_parent_cb_resize(BlockDriverState *bs)
   3434{
   3435    BdrvChild *c;
   3436    QLIST_FOREACH(c, &bs->parents, next_parent) {
   3437        if (c->klass->resize) {
   3438            c->klass->resize(c);
   3439        }
   3440    }
   3441}
   3442
   3443/**
   3444 * Truncate file to 'offset' bytes (needed only for file protocols)
   3445 *
   3446 * If 'exact' is true, the file must be resized to exactly the given
   3447 * 'offset'.  Otherwise, it is sufficient for the node to be at least
   3448 * 'offset' bytes in length.
   3449 */
   3450int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
   3451                                  PreallocMode prealloc, BdrvRequestFlags flags,
   3452                                  Error **errp)
   3453{
   3454    BlockDriverState *bs = child->bs;
   3455    BdrvChild *filtered, *backing;
   3456    BlockDriver *drv = bs->drv;
   3457    BdrvTrackedRequest req;
   3458    int64_t old_size, new_bytes;
   3459    int ret;
   3460
   3461
   3462    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
   3463    if (!drv) {
   3464        error_setg(errp, "No medium inserted");
   3465        return -ENOMEDIUM;
   3466    }
   3467    if (offset < 0) {
   3468        error_setg(errp, "Image size cannot be negative");
   3469        return -EINVAL;
   3470    }
   3471
   3472    ret = bdrv_check_request(offset, 0, errp);
   3473    if (ret < 0) {
   3474        return ret;
   3475    }
   3476
   3477    old_size = bdrv_getlength(bs);
   3478    if (old_size < 0) {
   3479        error_setg_errno(errp, -old_size, "Failed to get old image size");
   3480        return old_size;
   3481    }
   3482
   3483    if (bdrv_is_read_only(bs)) {
   3484        error_setg(errp, "Image is read-only");
   3485        return -EACCES;
   3486    }
   3487
   3488    if (offset > old_size) {
   3489        new_bytes = offset - old_size;
   3490    } else {
   3491        new_bytes = 0;
   3492    }
   3493
   3494    bdrv_inc_in_flight(bs);
   3495    tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
   3496                          BDRV_TRACKED_TRUNCATE);
   3497
   3498    /* If we are growing the image and potentially using preallocation for the
   3499     * new area, we need to make sure that no write requests are made to it
   3500     * concurrently or they might be overwritten by preallocation. */
   3501    if (new_bytes) {
   3502        bdrv_make_request_serialising(&req, 1);
   3503    }
   3504    ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
   3505                                    0);
   3506    if (ret < 0) {
   3507        error_setg_errno(errp, -ret,
   3508                         "Failed to prepare request for truncation");
   3509        goto out;
   3510    }
   3511
   3512    filtered = bdrv_filter_child(bs);
   3513    backing = bdrv_cow_child(bs);
   3514
   3515    /*
   3516     * If the image has a backing file that is large enough that it would
   3517     * provide data for the new area, we cannot leave it unallocated because
   3518     * then the backing file content would become visible. Instead, zero-fill
   3519     * the new area.
   3520     *
   3521     * Note that if the image has a backing file, but was opened without the
   3522     * backing file, taking care of keeping things consistent with that backing
   3523     * file is the user's responsibility.
   3524     */
   3525    if (new_bytes && backing) {
   3526        int64_t backing_len;
   3527
   3528        backing_len = bdrv_getlength(backing->bs);
   3529        if (backing_len < 0) {
   3530            ret = backing_len;
   3531            error_setg_errno(errp, -ret, "Could not get backing file size");
   3532            goto out;
   3533        }
   3534
   3535        if (backing_len > old_size) {
   3536            flags |= BDRV_REQ_ZERO_WRITE;
   3537        }
   3538    }
   3539
   3540    if (drv->bdrv_co_truncate) {
   3541        if (flags & ~bs->supported_truncate_flags) {
   3542            error_setg(errp, "Block driver does not support requested flags");
   3543            ret = -ENOTSUP;
   3544            goto out;
   3545        }
   3546        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
   3547    } else if (filtered) {
   3548        ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
   3549    } else {
   3550        error_setg(errp, "Image format driver does not support resize");
   3551        ret = -ENOTSUP;
   3552        goto out;
   3553    }
   3554    if (ret < 0) {
   3555        goto out;
   3556    }
   3557
   3558    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
   3559    if (ret < 0) {
   3560        error_setg_errno(errp, -ret, "Could not refresh total sector count");
   3561    } else {
   3562        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
   3563    }
    3564    /* Truncation may have succeeded even though refresh_total_sectors failed;
    3565     * the latter doesn't affect how we should finish the request.
    3566     * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
   3567    bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
   3568
   3569out:
   3570    tracked_request_end(&req);
   3571    bdrv_dec_in_flight(bs);
   3572
   3573    return ret;
   3574}
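
        /*
         * Illustrative sketch (hypothetical helper): growing an image so that it is
         * at least new_size bytes long (exact=false), without preallocation, from
         * coroutine context.
         */
        static int coroutine_fn example_grow_image(BdrvChild *child, int64_t new_size,
                                                   Error **errp)
        {
            return bdrv_co_truncate(child, new_size, false /* exact */,
                                    PREALLOC_MODE_OFF, 0, errp);
        }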
   3575
   3576void bdrv_cancel_in_flight(BlockDriverState *bs)
   3577{
   3578    if (!bs || !bs->drv) {
   3579        return;
   3580    }
   3581
   3582    if (bs->drv->bdrv_cancel_in_flight) {
   3583        bs->drv->bdrv_cancel_in_flight(bs);
   3584    }
   3585}