cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

qcow2-cluster.c (87089B)


      1/*
      2 * Block driver for the QCOW version 2 format
      3 *
      4 * Copyright (c) 2004-2006 Fabrice Bellard
      5 *
      6 * Permission is hereby granted, free of charge, to any person obtaining a copy
      7 * of this software and associated documentation files (the "Software"), to deal
      8 * in the Software without restriction, including without limitation the rights
      9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10 * copies of the Software, and to permit persons to whom the Software is
     11 * furnished to do so, subject to the following conditions:
     12 *
     13 * The above copyright notice and this permission notice shall be included in
     14 * all copies or substantial portions of the Software.
     15 *
     16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22 * THE SOFTWARE.
     23 */
     24
     25#include "qemu/osdep.h"
     26#include <zlib.h>
     27
     28#include "qapi/error.h"
     29#include "qcow2.h"
     30#include "qemu/bswap.h"
     31#include "trace.h"
     32
     33int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
     34{
     35    BDRVQcow2State *s = bs->opaque;
     36    int new_l1_size, i, ret;
     37
     38    if (exact_size >= s->l1_size) {
     39        return 0;
     40    }
     41
     42    new_l1_size = exact_size;
     43
     44#ifdef DEBUG_ALLOC2
     45    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
     46#endif
     47
     48    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
     49    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
     50                                       new_l1_size * L1E_SIZE,
     51                             (s->l1_size - new_l1_size) * L1E_SIZE, 0);
     52    if (ret < 0) {
     53        goto fail;
     54    }
     55
     56    ret = bdrv_flush(bs->file->bs);
     57    if (ret < 0) {
     58        goto fail;
     59    }
     60
     61    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
     62    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
     63        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
     64            continue;
     65        }
     66        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
     67                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
     68        s->l1_table[i] = 0;
     69    }
     70    return 0;
     71
     72fail:
     73    /*
      74     * If the write to the l1_table failed, the image may contain a partially
     75     * overwritten l1_table. In this case it would be better to clear the
     76     * l1_table in memory to avoid possible image corruption.
     77     */
     78    memset(s->l1_table + new_l1_size, 0,
     79           (s->l1_size - new_l1_size) * L1E_SIZE);
     80    return ret;
     81}
     82
     83int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
     84                        bool exact_size)
     85{
     86    BDRVQcow2State *s = bs->opaque;
     87    int new_l1_size2, ret, i;
     88    uint64_t *new_l1_table;
     89    int64_t old_l1_table_offset, old_l1_size;
     90    int64_t new_l1_table_offset, new_l1_size;
     91    uint8_t data[12];
     92
     93    if (min_size <= s->l1_size)
     94        return 0;
     95
     96    /* Do a sanity check on min_size before trying to calculate new_l1_size
     97     * (this prevents overflows during the while loop for the calculation of
     98     * new_l1_size) */
     99    if (min_size > INT_MAX / L1E_SIZE) {
    100        return -EFBIG;
    101    }
    102
    103    if (exact_size) {
    104        new_l1_size = min_size;
    105    } else {
    106        /* Bump size up to reduce the number of times we have to grow */
    107        new_l1_size = s->l1_size;
    108        if (new_l1_size == 0) {
    109            new_l1_size = 1;
    110        }
    111        while (min_size > new_l1_size) {
    112            new_l1_size = DIV_ROUND_UP(new_l1_size * 3, 2);
    113        }
    114    }
    115
    116    QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX);
    117    if (new_l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
    118        return -EFBIG;
    119    }
    120
    121#ifdef DEBUG_ALLOC2
    122    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
    123            s->l1_size, new_l1_size);
    124#endif
    125
    126    new_l1_size2 = L1E_SIZE * new_l1_size;
    127    new_l1_table = qemu_try_blockalign(bs->file->bs, new_l1_size2);
    128    if (new_l1_table == NULL) {
    129        return -ENOMEM;
    130    }
    131    memset(new_l1_table, 0, new_l1_size2);
    132
    133    if (s->l1_size) {
    134        memcpy(new_l1_table, s->l1_table, s->l1_size * L1E_SIZE);
    135    }
    136
    137    /* write new table (align to cluster) */
    138    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    139    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    140    if (new_l1_table_offset < 0) {
    141        qemu_vfree(new_l1_table);
    142        return new_l1_table_offset;
    143    }
    144
    145    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    146    if (ret < 0) {
    147        goto fail;
    148    }
    149
    150    /* the L1 position has not yet been updated, so these clusters must
    151     * indeed be completely free */
    152    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
    153                                        new_l1_size2, false);
    154    if (ret < 0) {
    155        goto fail;
    156    }
    157
    158    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    159    for(i = 0; i < s->l1_size; i++)
    160        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    161    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
    162                           new_l1_table, new_l1_size2);
    163    if (ret < 0)
    164        goto fail;
    165    for(i = 0; i < s->l1_size; i++)
    166        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
    167
    168    /* set new table */
    169    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
    170    stl_be_p(data, new_l1_size);
    171    stq_be_p(data + 4, new_l1_table_offset);
    172    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
    173                           data, sizeof(data));
    174    if (ret < 0) {
    175        goto fail;
    176    }
    177    qemu_vfree(s->l1_table);
    178    old_l1_table_offset = s->l1_table_offset;
    179    s->l1_table_offset = new_l1_table_offset;
    180    s->l1_table = new_l1_table;
    181    old_l1_size = s->l1_size;
    182    s->l1_size = new_l1_size;
    183    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * L1E_SIZE,
    184                        QCOW2_DISCARD_OTHER);
    185    return 0;
    186 fail:
    187    qemu_vfree(new_l1_table);
    188    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
    189                        QCOW2_DISCARD_OTHER);
    190    return ret;
    191}
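
/*
 * The sizing policy above, restated in isolation: unless an exact size is
 * requested, the L1 table grows in roughly 1.5x steps until it covers
 * min_size, and the result is capped by QCOW_MAX_L1_SIZE.  A minimal sketch
 * of that calculation (hypothetical helper, not used by the driver):
 */
static inline int64_t qcow2_sketch_new_l1_size(int64_t cur_size,
                                               int64_t min_size)
{
    int64_t new_size = cur_size ? cur_size : 1;

    /* Grow in 1.5x increments so repeated growth needs few reallocations */
    while (min_size > new_size) {
        new_size = DIV_ROUND_UP(new_size * 3, 2);
    }
    return new_size;
}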
    192
    193/*
    194 * l2_load
    195 *
    196 * @bs: The BlockDriverState
    197 * @offset: A guest offset, used to calculate what slice of the L2
    198 *          table to load.
    199 * @l2_offset: Offset to the L2 table in the image file.
    200 * @l2_slice: Location to store the pointer to the L2 slice.
    201 *
     202 * Loads an L2 slice into memory (L2 slices are the parts of L2 tables
    203 * that are loaded by the qcow2 cache). If the slice is in the cache,
    204 * the cache is used; otherwise the L2 slice is loaded from the image
    205 * file.
    206 */
    207static int l2_load(BlockDriverState *bs, uint64_t offset,
    208                   uint64_t l2_offset, uint64_t **l2_slice)
    209{
    210    BDRVQcow2State *s = bs->opaque;
    211    int start_of_slice = l2_entry_size(s) *
    212        (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset));
    213
    214    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice,
    215                           (void **)l2_slice);
    216}
    217
    218/*
    219 * Writes an L1 entry to disk (note that depending on the alignment
     220 * requirements this function may write more than just one entry in
    221 * order to prevent bdrv_pwrite from performing a read-modify-write)
    222 */
    223int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
    224{
    225    BDRVQcow2State *s = bs->opaque;
    226    int l1_start_index;
    227    int i, ret;
    228    int bufsize = MAX(L1E_SIZE,
    229                      MIN(bs->file->bs->bl.request_alignment, s->cluster_size));
    230    int nentries = bufsize / L1E_SIZE;
    231    g_autofree uint64_t *buf = g_try_new0(uint64_t, nentries);
    232
    233    if (buf == NULL) {
    234        return -ENOMEM;
    235    }
    236
    237    l1_start_index = QEMU_ALIGN_DOWN(l1_index, nentries);
    238    for (i = 0; i < MIN(nentries, s->l1_size - l1_start_index); i++) {
    239        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    240    }
    241
    242    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
    243            s->l1_table_offset + L1E_SIZE * l1_start_index, bufsize, false);
    244    if (ret < 0) {
    245        return ret;
    246    }
    247
    248    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    249    ret = bdrv_pwrite_sync(bs->file,
    250                           s->l1_table_offset + L1E_SIZE * l1_start_index,
    251                           buf, bufsize);
    252    if (ret < 0) {
    253        return ret;
    254    }
    255
    256    return 0;
    257}
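
/*
 * The buffer sizing above ensures a single L1 update covers at least one
 * entry and at most one cluster, rounded to the request alignment of
 * bs->file so that bdrv_pwrite() never has to read-modify-write.  A minimal
 * sketch of the index math (hypothetical helper, not used by the driver):
 */
static inline int qcow2_sketch_l1_write_start(int l1_index, int bufsize)
{
    int nentries = bufsize / L1E_SIZE;   /* L1 entries covered by one write */

    /* First entry of the aligned group that contains l1_index */
    return QEMU_ALIGN_DOWN(l1_index, nentries);
}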
    258
    259/*
    260 * l2_allocate
    261 *
     262 * Allocate a new L2 table in the file. If the L1 entry at @l1_index
     263 * already points to an L2 table (i.e. we are doing copy-on-write of the L2
     264 * table), copy the contents of the old L2 table into the newly allocated
     265 * one. Otherwise the new table is initialized with zeros.
    266 *
    267 */
    268
    269static int l2_allocate(BlockDriverState *bs, int l1_index)
    270{
    271    BDRVQcow2State *s = bs->opaque;
    272    uint64_t old_l2_offset;
    273    uint64_t *l2_slice = NULL;
    274    unsigned slice, slice_size2, n_slices;
    275    int64_t l2_offset;
    276    int ret;
    277
    278    old_l2_offset = s->l1_table[l1_index];
    279
    280    trace_qcow2_l2_allocate(bs, l1_index);
    281
    282    /* allocate a new l2 entry */
    283
    284    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * l2_entry_size(s));
    285    if (l2_offset < 0) {
    286        ret = l2_offset;
    287        goto fail;
    288    }
    289
    290    /* The offset must fit in the offset field of the L1 table entry */
    291    assert((l2_offset & L1E_OFFSET_MASK) == l2_offset);
    292
    293    /* If we're allocating the table at offset 0 then something is wrong */
    294    if (l2_offset == 0) {
    295        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
    296                                "allocation of L2 table at offset 0");
    297        ret = -EIO;
    298        goto fail;
    299    }
    300
    301    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    302    if (ret < 0) {
    303        goto fail;
    304    }
    305
    306    /* allocate a new entry in the l2 cache */
    307
    308    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    309    n_slices = s->cluster_size / slice_size2;
    310
    311    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
    312    for (slice = 0; slice < n_slices; slice++) {
    313        ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
    314                                    l2_offset + slice * slice_size2,
    315                                    (void **) &l2_slice);
    316        if (ret < 0) {
    317            goto fail;
    318        }
    319
    320        if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
    321            /* if there was no old l2 table, clear the new slice */
    322            memset(l2_slice, 0, slice_size2);
    323        } else {
    324            uint64_t *old_slice;
    325            uint64_t old_l2_slice_offset =
    326                (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2;
    327
    328            /* if there was an old l2 table, read a slice from the disk */
    329            BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
    330            ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset,
    331                                  (void **) &old_slice);
    332            if (ret < 0) {
    333                goto fail;
    334            }
    335
    336            memcpy(l2_slice, old_slice, slice_size2);
    337
    338            qcow2_cache_put(s->l2_table_cache, (void **) &old_slice);
    339        }
    340
    341        /* write the l2 slice to the file */
    342        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
    343
    344        trace_qcow2_l2_allocate_write_l2(bs, l1_index);
    345        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
    346        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    347    }
    348
    349    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    350    if (ret < 0) {
    351        goto fail;
    352    }
    353
    354    /* update the L1 entry */
    355    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
    356    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    357    ret = qcow2_write_l1_entry(bs, l1_index);
    358    if (ret < 0) {
    359        goto fail;
    360    }
    361
    362    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    363    return 0;
    364
    365fail:
    366    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
    367    if (l2_slice != NULL) {
    368        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    369    }
    370    s->l1_table[l1_index] = old_l2_offset;
    371    if (l2_offset > 0) {
    372        qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s),
    373                            QCOW2_DISCARD_ALWAYS);
    374    }
    375    return ret;
    376}
    377
    378/*
    379 * For a given L2 entry, count the number of contiguous subclusters of
    380 * the same type starting from @sc_from. Compressed clusters are
    381 * treated as if they were divided into subclusters of size
    382 * s->subcluster_size.
    383 *
    384 * Return the number of contiguous subclusters and set @type to the
    385 * subcluster type.
    386 *
    387 * If the L2 entry is invalid return -errno and set @type to
    388 * QCOW2_SUBCLUSTER_INVALID.
    389 */
    390static int qcow2_get_subcluster_range_type(BlockDriverState *bs,
    391                                           uint64_t l2_entry,
    392                                           uint64_t l2_bitmap,
    393                                           unsigned sc_from,
    394                                           QCow2SubclusterType *type)
    395{
    396    BDRVQcow2State *s = bs->opaque;
    397    uint32_t val;
    398
    399    *type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_from);
    400
    401    if (*type == QCOW2_SUBCLUSTER_INVALID) {
    402        return -EINVAL;
    403    } else if (!has_subclusters(s) || *type == QCOW2_SUBCLUSTER_COMPRESSED) {
    404        return s->subclusters_per_cluster - sc_from;
    405    }
    406
    407    switch (*type) {
    408    case QCOW2_SUBCLUSTER_NORMAL:
    409        val = l2_bitmap | QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from);
    410        return cto32(val) - sc_from;
    411
    412    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
    413    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
    414        val = (l2_bitmap | QCOW_OFLAG_SUB_ZERO_RANGE(0, sc_from)) >> 32;
    415        return cto32(val) - sc_from;
    416
    417    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
    418    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
    419        val = ((l2_bitmap >> 32) | l2_bitmap)
    420            & ~QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from);
    421        return ctz32(val) - sc_from;
    422
    423    default:
    424        g_assert_not_reached();
    425    }
    426}
    427
    428/*
    429 * Return the number of contiguous subclusters of the exact same type
    430 * in a given L2 slice, starting from cluster @l2_index, subcluster
    431 * @sc_index. Allocated subclusters are required to be contiguous in
    432 * the image file.
    433 * At most @nb_clusters are checked (note that this means clusters,
    434 * not subclusters).
    435 * Compressed clusters are always processed one by one but for the
    436 * purpose of this count they are treated as if they were divided into
    437 * subclusters of size s->subcluster_size.
    438 * On failure return -errno and update @l2_index to point to the
    439 * invalid entry.
    440 */
    441static int count_contiguous_subclusters(BlockDriverState *bs, int nb_clusters,
    442                                        unsigned sc_index, uint64_t *l2_slice,
    443                                        unsigned *l2_index)
    444{
    445    BDRVQcow2State *s = bs->opaque;
    446    int i, count = 0;
    447    bool check_offset = false;
    448    uint64_t expected_offset = 0;
    449    QCow2SubclusterType expected_type = QCOW2_SUBCLUSTER_NORMAL, type;
    450
    451    assert(*l2_index + nb_clusters <= s->l2_slice_size);
    452
    453    for (i = 0; i < nb_clusters; i++) {
    454        unsigned first_sc = (i == 0) ? sc_index : 0;
    455        uint64_t l2_entry = get_l2_entry(s, l2_slice, *l2_index + i);
    456        uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, *l2_index + i);
    457        int ret = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap,
    458                                                  first_sc, &type);
    459        if (ret < 0) {
    460            *l2_index += i; /* Point to the invalid entry */
    461            return -EIO;
    462        }
    463        if (i == 0) {
    464            if (type == QCOW2_SUBCLUSTER_COMPRESSED) {
    465                /* Compressed clusters are always processed one by one */
    466                return ret;
    467            }
    468            expected_type = type;
    469            expected_offset = l2_entry & L2E_OFFSET_MASK;
    470            check_offset = (type == QCOW2_SUBCLUSTER_NORMAL ||
    471                            type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
    472                            type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC);
    473        } else if (type != expected_type) {
    474            break;
    475        } else if (check_offset) {
    476            expected_offset += s->cluster_size;
    477            if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
    478                break;
    479            }
    480        }
    481        count += ret;
    482        /* Stop if there are type changes before the end of the cluster */
    483        if (first_sc + ret < s->subclusters_per_cluster) {
    484            break;
    485        }
    486    }
    487
    488    return count;
    489}
    490
    491static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
    492                                            uint64_t src_cluster_offset,
    493                                            unsigned offset_in_cluster,
    494                                            QEMUIOVector *qiov)
    495{
    496    int ret;
    497
    498    if (qiov->size == 0) {
    499        return 0;
    500    }
    501
    502    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
    503
    504    if (!bs->drv) {
    505        return -ENOMEDIUM;
    506    }
    507
    508    /*
    509     * We never deal with requests that don't satisfy
    510     * bdrv_check_qiov_request(), and aligning requests to clusters never
    511     * breaks this condition. So, do some assertions before calling
    512     * bs->drv->bdrv_co_preadv_part() which has int64_t arguments.
    513     */
    514    assert(src_cluster_offset <= INT64_MAX);
    515    assert(src_cluster_offset + offset_in_cluster <= INT64_MAX);
    516    assert(qiov->size <= INT64_MAX);
    517    bdrv_check_qiov_request(src_cluster_offset + offset_in_cluster, qiov->size,
    518                            qiov, 0, &error_abort);
    519    /*
     520     * Call .bdrv_co_preadv_part() directly instead of using the public block-layer
    521     * interface.  This avoids double I/O throttling and request tracking,
    522     * which can lead to deadlock when block layer copy-on-read is enabled.
    523     */
    524    ret = bs->drv->bdrv_co_preadv_part(bs,
    525                                       src_cluster_offset + offset_in_cluster,
    526                                       qiov->size, qiov, 0, 0);
    527    if (ret < 0) {
    528        return ret;
    529    }
    530
    531    return 0;
    532}
    533
    534static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
    535                                             uint64_t cluster_offset,
    536                                             unsigned offset_in_cluster,
    537                                             QEMUIOVector *qiov)
    538{
    539    BDRVQcow2State *s = bs->opaque;
    540    int ret;
    541
    542    if (qiov->size == 0) {
    543        return 0;
    544    }
    545
    546    ret = qcow2_pre_write_overlap_check(bs, 0,
    547            cluster_offset + offset_in_cluster, qiov->size, true);
    548    if (ret < 0) {
    549        return ret;
    550    }
    551
    552    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
    553    ret = bdrv_co_pwritev(s->data_file, cluster_offset + offset_in_cluster,
    554                          qiov->size, qiov, 0);
    555    if (ret < 0) {
    556        return ret;
    557    }
    558
    559    return 0;
    560}
    561
    562
    563/*
    564 * get_host_offset
    565 *
    566 * For a given offset of the virtual disk find the equivalent host
    567 * offset in the qcow2 file and store it in *host_offset. Neither
    568 * offset needs to be aligned to a cluster boundary.
    569 *
    570 * If the cluster is unallocated then *host_offset will be 0.
    571 * If the cluster is compressed then *host_offset will contain the l2 entry.
    572 *
    573 * On entry, *bytes is the maximum number of contiguous bytes starting at
    574 * offset that we are interested in.
    575 *
    576 * On exit, *bytes is the number of bytes starting at offset that have the same
    577 * subcluster type and (if applicable) are stored contiguously in the image
    578 * file. The subcluster type is stored in *subcluster_type.
    579 * Compressed clusters are always processed one by one.
    580 *
    581 * Returns 0 on success, -errno in error cases.
    582 */
    583int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset,
    584                          unsigned int *bytes, uint64_t *host_offset,
    585                          QCow2SubclusterType *subcluster_type)
    586{
    587    BDRVQcow2State *s = bs->opaque;
    588    unsigned int l2_index, sc_index;
    589    uint64_t l1_index, l2_offset, *l2_slice, l2_entry, l2_bitmap;
    590    int sc;
    591    unsigned int offset_in_cluster;
    592    uint64_t bytes_available, bytes_needed, nb_clusters;
    593    QCow2SubclusterType type;
    594    int ret;
    595
    596    offset_in_cluster = offset_into_cluster(s, offset);
    597    bytes_needed = (uint64_t) *bytes + offset_in_cluster;
    598
    599    /* compute how many bytes there are between the start of the cluster
    600     * containing offset and the end of the l2 slice that contains
    601     * the entry pointing to it */
    602    bytes_available =
    603        ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
    604        << s->cluster_bits;
    605
    606    if (bytes_needed > bytes_available) {
    607        bytes_needed = bytes_available;
    608    }
    609
    610    *host_offset = 0;
    611
    612    /* seek to the l2 offset in the l1 table */
    613
    614    l1_index = offset_to_l1_index(s, offset);
    615    if (l1_index >= s->l1_size) {
    616        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
    617        goto out;
    618    }
    619
    620    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    621    if (!l2_offset) {
    622        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
    623        goto out;
    624    }
    625
    626    if (offset_into_cluster(s, l2_offset)) {
    627        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
    628                                " unaligned (L1 index: %#" PRIx64 ")",
    629                                l2_offset, l1_index);
    630        return -EIO;
    631    }
    632
    633    /* load the l2 slice in memory */
    634
    635    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    636    if (ret < 0) {
    637        return ret;
    638    }
    639
    640    /* find the cluster offset for the given disk offset */
    641
    642    l2_index = offset_to_l2_slice_index(s, offset);
    643    sc_index = offset_to_sc_index(s, offset);
    644    l2_entry = get_l2_entry(s, l2_slice, l2_index);
    645    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
    646
    647    nb_clusters = size_to_clusters(s, bytes_needed);
    648    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
    649     * integers; the minimum cluster size is 512, so this assertion is always
    650     * true */
    651    assert(nb_clusters <= INT_MAX);
    652
    653    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
    654    if (s->qcow_version < 3 && (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
    655                                type == QCOW2_SUBCLUSTER_ZERO_ALLOC)) {
    656        qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
    657                                " in pre-v3 image (L2 offset: %#" PRIx64
    658                                ", L2 index: %#x)", l2_offset, l2_index);
    659        ret = -EIO;
    660        goto fail;
    661    }
    662    switch (type) {
    663    case QCOW2_SUBCLUSTER_INVALID:
    664        break; /* This is handled by count_contiguous_subclusters() below */
    665    case QCOW2_SUBCLUSTER_COMPRESSED:
    666        if (has_data_file(bs)) {
    667            qcow2_signal_corruption(bs, true, -1, -1, "Compressed cluster "
    668                                    "entry found in image with external data "
    669                                    "file (L2 offset: %#" PRIx64 ", L2 index: "
    670                                    "%#x)", l2_offset, l2_index);
    671            ret = -EIO;
    672            goto fail;
    673        }
    674        *host_offset = l2_entry;
    675        break;
    676    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
    677    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
    678        break;
    679    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
    680    case QCOW2_SUBCLUSTER_NORMAL:
    681    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: {
    682        uint64_t host_cluster_offset = l2_entry & L2E_OFFSET_MASK;
    683        *host_offset = host_cluster_offset + offset_in_cluster;
    684        if (offset_into_cluster(s, host_cluster_offset)) {
    685            qcow2_signal_corruption(bs, true, -1, -1,
    686                                    "Cluster allocation offset %#"
    687                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
    688                                    ", L2 index: %#x)", host_cluster_offset,
    689                                    l2_offset, l2_index);
    690            ret = -EIO;
    691            goto fail;
    692        }
    693        if (has_data_file(bs) && *host_offset != offset) {
    694            qcow2_signal_corruption(bs, true, -1, -1,
    695                                    "External data file host cluster offset %#"
    696                                    PRIx64 " does not match guest cluster "
    697                                    "offset: %#" PRIx64
    698                                    ", L2 index: %#x)", host_cluster_offset,
    699                                    offset - offset_in_cluster, l2_index);
    700            ret = -EIO;
    701            goto fail;
    702        }
    703        break;
    704    }
    705    default:
    706        abort();
    707    }
    708
    709    sc = count_contiguous_subclusters(bs, nb_clusters, sc_index,
    710                                      l2_slice, &l2_index);
    711    if (sc < 0) {
    712        qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster entry found "
    713                                " (L2 offset: %#" PRIx64 ", L2 index: %#x)",
    714                                l2_offset, l2_index);
    715        ret = -EIO;
    716        goto fail;
    717    }
    718    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    719
    720    bytes_available = ((int64_t)sc + sc_index) << s->subcluster_bits;
    721
    722out:
    723    if (bytes_available > bytes_needed) {
    724        bytes_available = bytes_needed;
    725    }
    726
    727    /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
    728     * subtracting offset_in_cluster will therefore definitely yield something
    729     * not exceeding UINT_MAX */
    730    assert(bytes_available - offset_in_cluster <= UINT_MAX);
    731    *bytes = bytes_available - offset_in_cluster;
    732
    733    *subcluster_type = type;
    734
    735    return 0;
    736
    737fail:
    738    qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
    739    return ret;
    740}
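
/*
 * A sketch of how a caller can use the function above to walk a guest range
 * piece by piece; *bytes shrinks on each call to the extent that shares one
 * subcluster type.  Hypothetical helper, not part of the driver; it assumes
 * the caller holds s->lock as the driver's own callers do:
 */
static inline int qcow2_sketch_walk_mapping(BlockDriverState *bs,
                                            uint64_t offset, uint64_t end)
{
    while (offset < end) {
        unsigned int bytes = MIN(end - offset, UINT_MAX);
        uint64_t host_offset;
        QCow2SubclusterType type;
        int ret;

        ret = qcow2_get_host_offset(bs, offset, &bytes, &host_offset, &type);
        if (ret < 0) {
            return ret;
        }
        /* [offset, offset + bytes) now maps uniformly; advance past it */
        offset += bytes;
    }
    return 0;
}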
    741
    742/*
    743 * get_cluster_table
    744 *
    745 * for a given disk offset, load (and allocate if needed)
    746 * the appropriate slice of its l2 table.
    747 *
    748 * the cluster index in the l2 slice is given to the caller.
    749 *
    750 * Returns 0 on success, -errno in failure case
    751 */
    752static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
    753                             uint64_t **new_l2_slice,
    754                             int *new_l2_index)
    755{
    756    BDRVQcow2State *s = bs->opaque;
    757    unsigned int l2_index;
    758    uint64_t l1_index, l2_offset;
    759    uint64_t *l2_slice = NULL;
    760    int ret;
    761
    762    /* seek to the l2 offset in the l1 table */
    763
    764    l1_index = offset_to_l1_index(s, offset);
    765    if (l1_index >= s->l1_size) {
    766        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
    767        if (ret < 0) {
    768            return ret;
    769        }
    770    }
    771
    772    assert(l1_index < s->l1_size);
    773    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    774    if (offset_into_cluster(s, l2_offset)) {
    775        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
    776                                " unaligned (L1 index: %#" PRIx64 ")",
    777                                l2_offset, l1_index);
    778        return -EIO;
    779    }
    780
    781    if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) {
    782        /* First allocate a new L2 table (and do COW if needed) */
    783        ret = l2_allocate(bs, l1_index);
    784        if (ret < 0) {
    785            return ret;
    786        }
    787
    788        /* Then decrease the refcount of the old table */
    789        if (l2_offset) {
    790            qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s),
    791                                QCOW2_DISCARD_OTHER);
    792        }
    793
    794        /* Get the offset of the newly-allocated l2 table */
    795        l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    796        assert(offset_into_cluster(s, l2_offset) == 0);
    797    }
    798
    799    /* load the l2 slice in memory */
    800    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    801    if (ret < 0) {
    802        return ret;
    803    }
    804
    805    /* find the cluster offset for the given disk offset */
    806
    807    l2_index = offset_to_l2_slice_index(s, offset);
    808
    809    *new_l2_slice = l2_slice;
    810    *new_l2_index = l2_index;
    811
    812    return 0;
    813}
    814
    815/*
    816 * alloc_compressed_cluster_offset
    817 *
    818 * For a given offset on the virtual disk, allocate a new compressed cluster
    819 * and put the host offset of the cluster into *host_offset. If a cluster is
    820 * already allocated at the offset, return an error.
    821 *
    822 * Return 0 on success and -errno in error cases
    823 */
    824int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
    825                                          uint64_t offset,
    826                                          int compressed_size,
    827                                          uint64_t *host_offset)
    828{
    829    BDRVQcow2State *s = bs->opaque;
    830    int l2_index, ret;
    831    uint64_t *l2_slice;
    832    int64_t cluster_offset;
    833    int nb_csectors;
    834
    835    if (has_data_file(bs)) {
    836        return 0;
    837    }
    838
    839    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    840    if (ret < 0) {
    841        return ret;
    842    }
    843
    844    /* Compression can't overwrite anything. Fail if the cluster was already
    845     * allocated. */
    846    cluster_offset = get_l2_entry(s, l2_slice, l2_index);
    847    if (cluster_offset & L2E_OFFSET_MASK) {
    848        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    849        return -EIO;
    850    }
    851
    852    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    853    if (cluster_offset < 0) {
    854        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    855        return cluster_offset;
    856    }
    857
    858    nb_csectors =
    859        (cluster_offset + compressed_size - 1) / QCOW2_COMPRESSED_SECTOR_SIZE -
    860        (cluster_offset / QCOW2_COMPRESSED_SECTOR_SIZE);
    861
    862    /* The offset and size must fit in their fields of the L2 table entry */
    863    assert((cluster_offset & s->cluster_offset_mask) == cluster_offset);
    864    assert((nb_csectors & s->csize_mask) == nb_csectors);
    865
    866    cluster_offset |= QCOW_OFLAG_COMPRESSED |
    867                      ((uint64_t)nb_csectors << s->csize_shift);
    868
    869    /* update L2 table */
    870
    871    /* compressed clusters never have the copied flag */
    872
    873    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    874    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
    875    set_l2_entry(s, l2_slice, l2_index, cluster_offset);
    876    if (has_subclusters(s)) {
    877        set_l2_bitmap(s, l2_slice, l2_index, 0);
    878    }
    879    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    880
    881    *host_offset = cluster_offset & s->cluster_offset_mask;
    882    return 0;
    883}
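
/*
 * The nb_csectors computation above counts how many additional compressed
 * sectors (QCOW2_COMPRESSED_SECTOR_SIZE bytes each) are spanned beyond the
 * one containing the host offset: index of the last sector touched minus
 * index of the first.  A minimal standalone restatement (hypothetical
 * helper, not used by the driver):
 */
static inline int qcow2_sketch_nb_csectors(uint64_t host_offset, int csize)
{
    return (host_offset + csize - 1) / QCOW2_COMPRESSED_SECTOR_SIZE -
           host_offset / QCOW2_COMPRESSED_SECTOR_SIZE;
}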
    884
    885static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
    886{
    887    BDRVQcow2State *s = bs->opaque;
    888    Qcow2COWRegion *start = &m->cow_start;
    889    Qcow2COWRegion *end = &m->cow_end;
    890    unsigned buffer_size;
    891    unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
    892    bool merge_reads;
    893    uint8_t *start_buffer, *end_buffer;
    894    QEMUIOVector qiov;
    895    int ret;
    896
    897    assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
    898    assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
    899    assert(start->offset + start->nb_bytes <= end->offset);
    900
    901    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
    902        return 0;
    903    }
    904
    905    /* If we have to read both the start and end COW regions and the
    906     * middle region is not too large then perform just one read
    907     * operation */
    908    merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384;
    909    if (merge_reads) {
    910        buffer_size = start->nb_bytes + data_bytes + end->nb_bytes;
    911    } else {
    912        /* If we have to do two reads, add some padding in the middle
    913         * if necessary to make sure that the end region is optimally
    914         * aligned. */
    915        size_t align = bdrv_opt_mem_align(bs);
    916        assert(align > 0 && align <= UINT_MAX);
    917        assert(QEMU_ALIGN_UP(start->nb_bytes, align) <=
    918               UINT_MAX - end->nb_bytes);
    919        buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes;
    920    }
    921
    922    /* Reserve a buffer large enough to store all the data that we're
    923     * going to read */
    924    start_buffer = qemu_try_blockalign(bs, buffer_size);
    925    if (start_buffer == NULL) {
    926        return -ENOMEM;
    927    }
    928    /* The part of the buffer where the end region is located */
    929    end_buffer = start_buffer + buffer_size - end->nb_bytes;
    930
    931    qemu_iovec_init(&qiov, 2 + (m->data_qiov ?
    932                                qemu_iovec_subvec_niov(m->data_qiov,
    933                                                       m->data_qiov_offset,
    934                                                       data_bytes)
    935                                : 0));
    936
    937    qemu_co_mutex_unlock(&s->lock);
    938    /* First we read the existing data from both COW regions. We
    939     * either read the whole region in one go, or the start and end
    940     * regions separately. */
    941    if (merge_reads) {
    942        qemu_iovec_add(&qiov, start_buffer, buffer_size);
    943        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
    944    } else {
    945        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
    946        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
    947        if (ret < 0) {
    948            goto fail;
    949        }
    950
    951        qemu_iovec_reset(&qiov);
    952        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
    953        ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
    954    }
    955    if (ret < 0) {
    956        goto fail;
    957    }
    958
    959    /* Encrypt the data if necessary before writing it */
    960    if (bs->encrypted) {
    961        ret = qcow2_co_encrypt(bs,
    962                               m->alloc_offset + start->offset,
    963                               m->offset + start->offset,
    964                               start_buffer, start->nb_bytes);
    965        if (ret < 0) {
    966            goto fail;
    967        }
    968
    969        ret = qcow2_co_encrypt(bs,
    970                               m->alloc_offset + end->offset,
    971                               m->offset + end->offset,
    972                               end_buffer, end->nb_bytes);
    973        if (ret < 0) {
    974            goto fail;
    975        }
    976    }
    977
    978    /* And now we can write everything. If we have the guest data we
    979     * can write everything in one single operation */
    980    if (m->data_qiov) {
    981        qemu_iovec_reset(&qiov);
    982        if (start->nb_bytes) {
    983            qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
    984        }
    985        qemu_iovec_concat(&qiov, m->data_qiov, m->data_qiov_offset, data_bytes);
    986        if (end->nb_bytes) {
    987            qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
    988        }
    989        /* NOTE: we have a write_aio blkdebug event here followed by
    990         * a cow_write one in do_perform_cow_write(), but there's only
    991         * one single I/O operation */
    992        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
    993        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
    994    } else {
    995        /* If there's no guest data then write both COW regions separately */
    996        qemu_iovec_reset(&qiov);
    997        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
    998        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
    999        if (ret < 0) {
   1000            goto fail;
   1001        }
   1002
   1003        qemu_iovec_reset(&qiov);
   1004        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
   1005        ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
   1006    }
   1007
   1008fail:
   1009    qemu_co_mutex_lock(&s->lock);
   1010
   1011    /*
   1012     * Before we update the L2 table to actually point to the new cluster, we
   1013     * need to be sure that the refcounts have been increased and COW was
   1014     * handled.
   1015     */
   1016    if (ret == 0) {
   1017        qcow2_cache_depends_on_flush(s->l2_table_cache);
   1018    }
   1019
   1020    qemu_vfree(start_buffer);
   1021    qemu_iovec_destroy(&qiov);
   1022    return ret;
   1023}
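
/*
 * The merge_reads decision above, restated: reading the COW head, the middle
 * region and the COW tail in one request is preferred when both COW regions
 * exist and the middle data that gets read and discarded stays small
 * (16 KiB here).  A minimal sketch of the predicate (hypothetical helper):
 */
static inline bool qcow2_sketch_merge_cow_reads(unsigned start_bytes,
                                                unsigned end_bytes,
                                                unsigned data_bytes)
{
    return start_bytes && end_bytes && data_bytes <= 16384;
}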
   1024
   1025int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
   1026{
   1027    BDRVQcow2State *s = bs->opaque;
   1028    int i, j = 0, l2_index, ret;
   1029    uint64_t *old_cluster, *l2_slice;
   1030    uint64_t cluster_offset = m->alloc_offset;
   1031
   1032    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
   1033    assert(m->nb_clusters > 0);
   1034
   1035    old_cluster = g_try_new(uint64_t, m->nb_clusters);
   1036    if (old_cluster == NULL) {
   1037        ret = -ENOMEM;
   1038        goto err;
   1039    }
   1040
   1041    /* copy content of unmodified sectors */
   1042    ret = perform_cow(bs, m);
   1043    if (ret < 0) {
   1044        goto err;
   1045    }
   1046
   1047    /* Update L2 table. */
   1048    if (s->use_lazy_refcounts) {
   1049        qcow2_mark_dirty(bs);
   1050    }
   1051    if (qcow2_need_accurate_refcounts(s)) {
   1052        qcow2_cache_set_dependency(bs, s->l2_table_cache,
   1053                                   s->refcount_block_cache);
   1054    }
   1055
   1056    ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index);
   1057    if (ret < 0) {
   1058        goto err;
   1059    }
   1060    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
   1061
   1062    assert(l2_index + m->nb_clusters <= s->l2_slice_size);
   1063    assert(m->cow_end.offset + m->cow_end.nb_bytes <=
   1064           m->nb_clusters << s->cluster_bits);
   1065    for (i = 0; i < m->nb_clusters; i++) {
   1066        uint64_t offset = cluster_offset + ((uint64_t)i << s->cluster_bits);
    1067        /* If two concurrent writes happen to the same unallocated cluster,
    1068         * each write allocates a separate cluster and writes data concurrently.
    1069         * The first one to complete updates the L2 table with a pointer to its
    1070         * cluster; the second one has to do RMW (which is done above by
    1071         * perform_cow()), update the L2 table with its cluster pointer, and
    1072         * free the old cluster. This is what this loop does. */
   1073        if (get_l2_entry(s, l2_slice, l2_index + i) != 0) {
   1074            old_cluster[j++] = get_l2_entry(s, l2_slice, l2_index + i);
   1075        }
   1076
   1077        /* The offset must fit in the offset field of the L2 table entry */
   1078        assert((offset & L2E_OFFSET_MASK) == offset);
   1079
   1080        set_l2_entry(s, l2_slice, l2_index + i, offset | QCOW_OFLAG_COPIED);
   1081
   1082        /* Update bitmap with the subclusters that were just written */
   1083        if (has_subclusters(s) && !m->prealloc) {
   1084            uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
   1085            unsigned written_from = m->cow_start.offset;
   1086            unsigned written_to = m->cow_end.offset + m->cow_end.nb_bytes;
   1087            int first_sc, last_sc;
   1088            /* Narrow written_from and written_to down to the current cluster */
   1089            written_from = MAX(written_from, i << s->cluster_bits);
   1090            written_to   = MIN(written_to, (i + 1) << s->cluster_bits);
   1091            assert(written_from < written_to);
   1092            first_sc = offset_to_sc_index(s, written_from);
   1093            last_sc  = offset_to_sc_index(s, written_to - 1);
   1094            l2_bitmap |= QCOW_OFLAG_SUB_ALLOC_RANGE(first_sc, last_sc + 1);
   1095            l2_bitmap &= ~QCOW_OFLAG_SUB_ZERO_RANGE(first_sc, last_sc + 1);
   1096            set_l2_bitmap(s, l2_slice, l2_index + i, l2_bitmap);
   1097        }
    1098    }
   1099
   1100
   1101    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
   1102
   1103    /*
   1104     * If this was a COW, we need to decrease the refcount of the old cluster.
   1105     *
   1106     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
   1107     * clusters), the next write will reuse them anyway.
   1108     */
   1109    if (!m->keep_old_clusters && j != 0) {
   1110        for (i = 0; i < j; i++) {
   1111            qcow2_free_any_cluster(bs, old_cluster[i], QCOW2_DISCARD_NEVER);
   1112        }
   1113    }
   1114
   1115    ret = 0;
   1116err:
   1117    g_free(old_cluster);
   1118    return ret;
    1119}
   1120
   1121/**
   1122 * Frees the allocated clusters because the request failed and they won't
   1123 * actually be linked.
   1124 */
   1125void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
   1126{
   1127    BDRVQcow2State *s = bs->opaque;
   1128    if (!has_data_file(bs) && !m->keep_old_clusters) {
   1129        qcow2_free_clusters(bs, m->alloc_offset,
   1130                            m->nb_clusters << s->cluster_bits,
   1131                            QCOW2_DISCARD_NEVER);
   1132    }
   1133}
   1134
   1135/*
   1136 * For a given write request, create a new QCowL2Meta structure, add
   1137 * it to @m and the BDRVQcow2State.cluster_allocs list. If the write
   1138 * request does not need copy-on-write or changes to the L2 metadata
   1139 * then this function does nothing.
   1140 *
   1141 * @host_cluster_offset points to the beginning of the first cluster.
   1142 *
   1143 * @guest_offset and @bytes indicate the offset and length of the
   1144 * request.
   1145 *
   1146 * @l2_slice contains the L2 entries of all clusters involved in this
   1147 * write request.
   1148 *
   1149 * If @keep_old is true it means that the clusters were already
   1150 * allocated and will be overwritten. If false then the clusters are
   1151 * new and we have to decrease the reference count of the old ones.
   1152 *
   1153 * Returns 0 on success, -errno on failure.
   1154 */
   1155static int calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset,
   1156                             uint64_t guest_offset, unsigned bytes,
   1157                             uint64_t *l2_slice, QCowL2Meta **m, bool keep_old)
   1158{
   1159    BDRVQcow2State *s = bs->opaque;
   1160    int sc_index, l2_index = offset_to_l2_slice_index(s, guest_offset);
   1161    uint64_t l2_entry, l2_bitmap;
   1162    unsigned cow_start_from, cow_end_to;
   1163    unsigned cow_start_to = offset_into_cluster(s, guest_offset);
   1164    unsigned cow_end_from = cow_start_to + bytes;
   1165    unsigned nb_clusters = size_to_clusters(s, cow_end_from);
   1166    QCowL2Meta *old_m = *m;
   1167    QCow2SubclusterType type;
   1168    int i;
   1169    bool skip_cow = keep_old;
   1170
   1171    assert(nb_clusters <= s->l2_slice_size - l2_index);
   1172
   1173    /* Check the type of all affected subclusters */
   1174    for (i = 0; i < nb_clusters; i++) {
   1175        l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
   1176        l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
   1177        if (skip_cow) {
   1178            unsigned write_from = MAX(cow_start_to, i << s->cluster_bits);
   1179            unsigned write_to = MIN(cow_end_from, (i + 1) << s->cluster_bits);
   1180            int first_sc = offset_to_sc_index(s, write_from);
   1181            int last_sc = offset_to_sc_index(s, write_to - 1);
   1182            int cnt = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap,
   1183                                                      first_sc, &type);
    1184            /* Are any of the subclusters of type != QCOW2_SUBCLUSTER_NORMAL? */
   1185            if (type != QCOW2_SUBCLUSTER_NORMAL || first_sc + cnt <= last_sc) {
   1186                skip_cow = false;
   1187            }
   1188        } else {
   1189            /* If we can't skip the cow we can still look for invalid entries */
   1190            type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, 0);
   1191        }
   1192        if (type == QCOW2_SUBCLUSTER_INVALID) {
   1193            int l1_index = offset_to_l1_index(s, guest_offset);
   1194            uint64_t l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
   1195            qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster "
   1196                                    "entry found (L2 offset: %#" PRIx64
   1197                                    ", L2 index: %#x)",
   1198                                    l2_offset, l2_index + i);
   1199            return -EIO;
   1200        }
   1201    }
   1202
   1203    if (skip_cow) {
   1204        return 0;
   1205    }
   1206
   1207    /* Get the L2 entry of the first cluster */
   1208    l2_entry = get_l2_entry(s, l2_slice, l2_index);
   1209    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
   1210    sc_index = offset_to_sc_index(s, guest_offset);
   1211    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
   1212
   1213    if (!keep_old) {
   1214        switch (type) {
   1215        case QCOW2_SUBCLUSTER_COMPRESSED:
   1216            cow_start_from = 0;
   1217            break;
   1218        case QCOW2_SUBCLUSTER_NORMAL:
   1219        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
   1220        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
   1221            if (has_subclusters(s)) {
   1222                /* Skip all leading zero and unallocated subclusters */
   1223                uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC;
   1224                cow_start_from =
   1225                    MIN(sc_index, ctz32(alloc_bitmap)) << s->subcluster_bits;
   1226            } else {
   1227                cow_start_from = 0;
   1228            }
   1229            break;
   1230        case QCOW2_SUBCLUSTER_ZERO_PLAIN:
   1231        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
   1232            cow_start_from = sc_index << s->subcluster_bits;
   1233            break;
   1234        default:
   1235            g_assert_not_reached();
   1236        }
   1237    } else {
   1238        switch (type) {
   1239        case QCOW2_SUBCLUSTER_NORMAL:
   1240            cow_start_from = cow_start_to;
   1241            break;
   1242        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
   1243        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
   1244            cow_start_from = sc_index << s->subcluster_bits;
   1245            break;
   1246        default:
   1247            g_assert_not_reached();
   1248        }
   1249    }
   1250
   1251    /* Get the L2 entry of the last cluster */
   1252    l2_index += nb_clusters - 1;
   1253    l2_entry = get_l2_entry(s, l2_slice, l2_index);
   1254    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
   1255    sc_index = offset_to_sc_index(s, guest_offset + bytes - 1);
   1256    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
   1257
   1258    if (!keep_old) {
   1259        switch (type) {
   1260        case QCOW2_SUBCLUSTER_COMPRESSED:
   1261            cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
   1262            break;
   1263        case QCOW2_SUBCLUSTER_NORMAL:
   1264        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
   1265        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
   1266            cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
   1267            if (has_subclusters(s)) {
   1268                /* Skip all trailing zero and unallocated subclusters */
   1269                uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC;
   1270                cow_end_to -=
   1271                    MIN(s->subclusters_per_cluster - sc_index - 1,
   1272                        clz32(alloc_bitmap)) << s->subcluster_bits;
   1273            }
   1274            break;
   1275        case QCOW2_SUBCLUSTER_ZERO_PLAIN:
   1276        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
   1277            cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
   1278            break;
   1279        default:
   1280            g_assert_not_reached();
   1281        }
   1282    } else {
   1283        switch (type) {
   1284        case QCOW2_SUBCLUSTER_NORMAL:
   1285            cow_end_to = cow_end_from;
   1286            break;
   1287        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
   1288        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
   1289            cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
   1290            break;
   1291        default:
   1292            g_assert_not_reached();
   1293        }
   1294    }
   1295
   1296    *m = g_malloc0(sizeof(**m));
   1297    **m = (QCowL2Meta) {
   1298        .next           = old_m,
   1299
   1300        .alloc_offset   = host_cluster_offset,
   1301        .offset         = start_of_cluster(s, guest_offset),
   1302        .nb_clusters    = nb_clusters,
   1303
   1304        .keep_old_clusters = keep_old,
   1305
   1306        .cow_start = {
   1307            .offset     = cow_start_from,
   1308            .nb_bytes   = cow_start_to - cow_start_from,
   1309        },
   1310        .cow_end = {
   1311            .offset     = cow_end_from,
   1312            .nb_bytes   = cow_end_to - cow_end_from,
   1313        },
   1314    };
   1315
   1316    qemu_co_queue_init(&(*m)->dependent_requests);
   1317    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
   1318
   1319    return 0;
   1320}
   1321
   1322/*
   1323 * Returns true if writing to the cluster pointed to by @l2_entry
   1324 * requires a new allocation (that is, if the cluster is unallocated
   1325 * or has refcount > 1 and therefore cannot be written in-place).
   1326 */
   1327static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
   1328{
   1329    switch (qcow2_get_cluster_type(bs, l2_entry)) {
   1330    case QCOW2_CLUSTER_NORMAL:
   1331    case QCOW2_CLUSTER_ZERO_ALLOC:
   1332        if (l2_entry & QCOW_OFLAG_COPIED) {
   1333            return false;
   1334        }
   1335        /* fallthrough */
   1336    case QCOW2_CLUSTER_UNALLOCATED:
   1337    case QCOW2_CLUSTER_COMPRESSED:
   1338    case QCOW2_CLUSTER_ZERO_PLAIN:
   1339        return true;
   1340    default:
   1341        abort();
   1342    }
   1343}
   1344
   1345/*
   1346 * Returns the number of contiguous clusters that can be written to
   1347 * using one single write request, starting from @l2_index.
   1348 * At most @nb_clusters are checked.
   1349 *
   1350 * If @new_alloc is true this counts clusters that are either
   1351 * unallocated, or allocated but with refcount > 1 (so they need to be
   1352 * newly allocated and COWed).
   1353 *
   1354 * If @new_alloc is false this counts clusters that are already
   1355 * allocated and can be overwritten in-place (this includes clusters
   1356 * of type QCOW2_CLUSTER_ZERO_ALLOC).
   1357 */
   1358static int count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
   1359                                       uint64_t *l2_slice, int l2_index,
   1360                                       bool new_alloc)
   1361{
   1362    BDRVQcow2State *s = bs->opaque;
   1363    uint64_t l2_entry = get_l2_entry(s, l2_slice, l2_index);
   1364    uint64_t expected_offset = l2_entry & L2E_OFFSET_MASK;
   1365    int i;
   1366
   1367    for (i = 0; i < nb_clusters; i++) {
   1368        l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
   1369        if (cluster_needs_new_alloc(bs, l2_entry) != new_alloc) {
   1370            break;
   1371        }
   1372        if (!new_alloc) {
   1373            if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
   1374                break;
   1375            }
   1376            expected_offset += s->cluster_size;
   1377        }
   1378    }
   1379
   1380    assert(i <= nb_clusters);
   1381    return i;
   1382}
   1383
   1384/*
   1385 * Check if there already is an AIO write request in flight which allocates
   1386 * the same cluster. In this case we need to wait until the previous
   1387 * request has completed and updated the L2 table accordingly.
   1388 *
   1389 * Returns:
   1390 *   0       if there was no dependency. *cur_bytes indicates the number of
   1391 *           bytes from guest_offset that can be read before the next
   1392 *           dependency must be processed (or the request is complete)
   1393 *
    1394 *   -EAGAIN if we had to wait for another request; previously gathered
   1395 *           information on cluster allocation may be invalid now. The caller
   1396 *           must start over anyway, so consider *cur_bytes undefined.
   1397 */
   1398static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
   1399    uint64_t *cur_bytes, QCowL2Meta **m)
   1400{
   1401    BDRVQcow2State *s = bs->opaque;
   1402    QCowL2Meta *old_alloc;
   1403    uint64_t bytes = *cur_bytes;
   1404
   1405    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
   1406
   1407        uint64_t start = guest_offset;
   1408        uint64_t end = start + bytes;
   1409        uint64_t old_start = start_of_cluster(s, l2meta_cow_start(old_alloc));
   1410        uint64_t old_end = ROUND_UP(l2meta_cow_end(old_alloc), s->cluster_size);
   1411
   1412        if (end <= old_start || start >= old_end) {
   1413            /* No intersection */
   1414            continue;
   1415        }
   1416
   1417        if (old_alloc->keep_old_clusters &&
   1418            (end <= l2meta_cow_start(old_alloc) ||
   1419             start >= l2meta_cow_end(old_alloc)))
   1420        {
   1421            /*
    1422             * The clusters intersect, but the COW areas don't, and the cluster
    1423             * itself is already allocated. So there is no actual conflict.
   1424             */
   1425            continue;
   1426        }
   1427
   1428        /* Conflict */
   1429
   1430        if (start < old_start) {
   1431            /* Stop at the start of a running allocation */
   1432            bytes = old_start - start;
   1433        } else {
   1434            bytes = 0;
   1435        }
   1436
   1437        /*
   1438         * Stop if an l2meta already exists. After yielding, it wouldn't
   1439         * be valid any more, so we'd have to clean up the old L2Metas
   1440         * and deal with requests depending on them before starting to
   1441         * gather new ones. Not worth the trouble.
   1442         */
   1443        if (bytes == 0 && *m) {
   1444            *cur_bytes = 0;
   1445            return 0;
   1446        }
   1447
   1448        if (bytes == 0) {
   1449            /*
   1450             * Wait for the dependency to complete. We need to recheck
   1451             * the free/allocated clusters when we continue.
   1452             */
   1453            qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
   1454            return -EAGAIN;
   1455        }
   1456    }
   1457
   1458    /* Make sure that existing clusters and new allocations are only used up to
   1459     * the next dependency if we shortened the request above */
   1460    *cur_bytes = bytes;
   1461
   1462    return 0;
   1463}
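
        /*
         * Worked example for the clipping above (added for illustration, not
         * part of the original file), assuming 64 KiB clusters: suppose an
         * in-flight allocation has COW areas spanning guest clusters
         * 0x20000..0x40000, so old_start = 0x20000 and old_end = 0x40000. A new
         * request with guest_offset = 0x10000 and *cur_bytes = 0x30000 ends at
         * 0x40000 and therefore intersects. Since start < old_start, bytes is
         * clipped to old_start - start = 0x10000 and the function returns 0;
         * the caller deals with the rest in its next loop iteration. Had the
         * request started at or after old_start, bytes would have become 0 and
         * (with no L2Meta gathered yet) the coroutine would have waited on
         * old_alloc->dependent_requests, returning -EAGAIN.
         */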
   1464
   1465/*
   1466 * Checks how many already allocated clusters that don't require a new
    1467 * allocation exist at the given guest_offset (up to *bytes).
   1468 * If *host_offset is not INV_OFFSET, only physically contiguous clusters
   1469 * beginning at this host offset are counted.
   1470 *
   1471 * Note that guest_offset may not be cluster aligned. In this case, the
    1472 * returned *host_offset points to the exact byte referenced by guest_offset
    1473 * and therefore isn't cluster aligned either.
   1474 *
   1475 * Returns:
   1476 *   0:     if no allocated clusters are available at the given offset.
   1477 *          *bytes is normally unchanged. It is set to 0 if the cluster
   1478 *          is allocated and can be overwritten in-place but doesn't have
   1479 *          the right physical offset.
   1480 *
   1481 *   1:     if allocated clusters that can be overwritten in place are
   1482 *          available at the requested offset. *bytes may have decreased
   1483 *          and describes the length of the area that can be written to.
   1484 *
   1485 *  -errno: in error cases
   1486 */
   1487static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
   1488    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
   1489{
   1490    BDRVQcow2State *s = bs->opaque;
   1491    int l2_index;
   1492    uint64_t l2_entry, cluster_offset;
   1493    uint64_t *l2_slice;
   1494    uint64_t nb_clusters;
   1495    unsigned int keep_clusters;
   1496    int ret;
   1497
   1498    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
   1499                              *bytes);
   1500
   1501    assert(*host_offset == INV_OFFSET || offset_into_cluster(s, guest_offset)
   1502                                      == offset_into_cluster(s, *host_offset));
   1503
   1504    /*
   1505     * Calculate the number of clusters to look for. We stop at L2 slice
   1506     * boundaries to keep things simple.
   1507     */
   1508    nb_clusters =
   1509        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
   1510
   1511    l2_index = offset_to_l2_slice_index(s, guest_offset);
   1512    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
   1513    /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */
   1514    nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
   1515
   1516    /* Find L2 entry for the first involved cluster */
   1517    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
   1518    if (ret < 0) {
   1519        return ret;
   1520    }
   1521
   1522    l2_entry = get_l2_entry(s, l2_slice, l2_index);
   1523    cluster_offset = l2_entry & L2E_OFFSET_MASK;
   1524
   1525    if (!cluster_needs_new_alloc(bs, l2_entry)) {
   1526        if (offset_into_cluster(s, cluster_offset)) {
   1527            qcow2_signal_corruption(bs, true, -1, -1, "%s cluster offset "
   1528                                    "%#" PRIx64 " unaligned (guest offset: %#"
   1529                                    PRIx64 ")", l2_entry & QCOW_OFLAG_ZERO ?
   1530                                    "Preallocated zero" : "Data",
   1531                                    cluster_offset, guest_offset);
   1532            ret = -EIO;
   1533            goto out;
   1534        }
   1535
   1536        /* If a specific host_offset is required, check it */
   1537        if (*host_offset != INV_OFFSET && cluster_offset != *host_offset) {
   1538            *bytes = 0;
   1539            ret = 0;
   1540            goto out;
   1541        }
   1542
   1543        /* We keep all QCOW_OFLAG_COPIED clusters */
   1544        keep_clusters = count_single_write_clusters(bs, nb_clusters, l2_slice,
   1545                                                    l2_index, false);
   1546        assert(keep_clusters <= nb_clusters);
   1547
   1548        *bytes = MIN(*bytes,
   1549                 keep_clusters * s->cluster_size
   1550                 - offset_into_cluster(s, guest_offset));
   1551        assert(*bytes != 0);
   1552
   1553        ret = calculate_l2_meta(bs, cluster_offset, guest_offset,
   1554                                *bytes, l2_slice, m, true);
   1555        if (ret < 0) {
   1556            goto out;
   1557        }
   1558
   1559        ret = 1;
   1560    } else {
   1561        ret = 0;
   1562    }
   1563
   1564    /* Cleanup */
   1565out:
   1566    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
   1567
   1568    /* Only return a host offset if we actually made progress. Otherwise we
    1569     * would impose requirements on handle_alloc() that it can't fulfill */
   1570    if (ret > 0) {
   1571        *host_offset = cluster_offset + offset_into_cluster(s, guest_offset);
   1572    }
   1573
   1574    return ret;
   1575}
   1576
   1577/*
   1578 * Allocates new clusters for the given guest_offset.
   1579 *
   1580 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
   1581 * contain the number of clusters that have been allocated and are contiguous
   1582 * in the image file.
   1583 *
   1584 * If *host_offset is not INV_OFFSET, it specifies the offset in the image file
   1585 * at which the new clusters must start. *nb_clusters can be 0 on return in
   1586 * this case if the cluster at host_offset is already in use. If *host_offset
   1587 * is INV_OFFSET, the clusters can be allocated anywhere in the image file.
   1588 *
   1589 * *host_offset is updated to contain the offset into the image file at which
   1590 * the first allocated cluster starts.
   1591 *
   1592 * Return 0 on success and -errno in error cases. -EAGAIN means that the
   1593 * function has been waiting for another request and the allocation must be
   1594 * restarted, but the whole request should not be failed.
   1595 */
   1596static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
   1597                                   uint64_t *host_offset, uint64_t *nb_clusters)
   1598{
   1599    BDRVQcow2State *s = bs->opaque;
   1600
   1601    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
   1602                                         *host_offset, *nb_clusters);
   1603
   1604    if (has_data_file(bs)) {
   1605        assert(*host_offset == INV_OFFSET ||
   1606               *host_offset == start_of_cluster(s, guest_offset));
   1607        *host_offset = start_of_cluster(s, guest_offset);
   1608        return 0;
   1609    }
   1610
   1611    /* Allocate new clusters */
   1612    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
   1613    if (*host_offset == INV_OFFSET) {
   1614        int64_t cluster_offset =
   1615            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
   1616        if (cluster_offset < 0) {
   1617            return cluster_offset;
   1618        }
   1619        *host_offset = cluster_offset;
   1620        return 0;
   1621    } else {
   1622        int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
   1623        if (ret < 0) {
   1624            return ret;
   1625        }
   1626        *nb_clusters = ret;
   1627        return 0;
   1628    }
   1629}
   1630
   1631/*
   1632 * Allocates new clusters for an area that is either still unallocated or
   1633 * cannot be overwritten in-place. If *host_offset is not INV_OFFSET,
   1634 * clusters are only allocated if the new allocation can match the specified
   1635 * host offset.
   1636 *
   1637 * Note that guest_offset may not be cluster aligned. In this case, the
    1638 * returned *host_offset points to the exact byte referenced by guest_offset
    1639 * and therefore isn't cluster aligned either.
   1640 *
   1641 * Returns:
   1642 *   0:     if no clusters could be allocated. *bytes is set to 0,
   1643 *          *host_offset is left unchanged.
   1644 *
   1645 *   1:     if new clusters were allocated. *bytes may be decreased if the
   1646 *          new allocation doesn't cover all of the requested area.
   1647 *          *host_offset is updated to contain the host offset of the first
   1648 *          newly allocated cluster.
   1649 *
   1650 *  -errno: in error cases
   1651 */
   1652static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
   1653    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
   1654{
   1655    BDRVQcow2State *s = bs->opaque;
   1656    int l2_index;
   1657    uint64_t *l2_slice;
   1658    uint64_t nb_clusters;
   1659    int ret;
   1660
   1661    uint64_t alloc_cluster_offset;
   1662
   1663    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
   1664                             *bytes);
   1665    assert(*bytes > 0);
   1666
   1667    /*
   1668     * Calculate the number of clusters to look for. We stop at L2 slice
   1669     * boundaries to keep things simple.
   1670     */
   1671    nb_clusters =
   1672        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
   1673
   1674    l2_index = offset_to_l2_slice_index(s, guest_offset);
   1675    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
   1676    /* Limit total allocation byte count to BDRV_REQUEST_MAX_BYTES */
   1677    nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
   1678
   1679    /* Find L2 entry for the first involved cluster */
   1680    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
   1681    if (ret < 0) {
   1682        return ret;
   1683    }
   1684
   1685    nb_clusters = count_single_write_clusters(bs, nb_clusters,
   1686                                              l2_slice, l2_index, true);
   1687
   1688    /* This function is only called when there were no non-COW clusters, so if
   1689     * we can't find any unallocated or COW clusters either, something is
   1690     * wrong with our code. */
   1691    assert(nb_clusters > 0);
   1692
   1693    /* Allocate at a given offset in the image file */
   1694    alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET :
   1695        start_of_cluster(s, *host_offset);
   1696    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
   1697                                  &nb_clusters);
   1698    if (ret < 0) {
   1699        goto out;
   1700    }
   1701
   1702    /* Can't extend contiguous allocation */
   1703    if (nb_clusters == 0) {
   1704        *bytes = 0;
   1705        ret = 0;
   1706        goto out;
   1707    }
   1708
   1709    assert(alloc_cluster_offset != INV_OFFSET);
   1710
   1711    /*
   1712     * Save info needed for meta data update.
   1713     *
   1714     * requested_bytes: Number of bytes from the start of the first
    1715     * newly allocated cluster to the end of the (possibly already
    1716     * shortened) write request.
   1717     *
   1718     * avail_bytes: Number of bytes from the start of the first
    1719     * newly allocated cluster to the end of the last newly allocated cluster.
   1720     *
   1721     * nb_bytes: The number of bytes from the start of the first
   1722     * newly allocated cluster to the end of the area that the write
   1723     * request actually writes to (excluding COW at the end)
   1724     */
   1725    uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
   1726    int avail_bytes = nb_clusters << s->cluster_bits;
   1727    int nb_bytes = MIN(requested_bytes, avail_bytes);
   1728
   1729    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
   1730    *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
   1731    assert(*bytes != 0);
   1732
   1733    ret = calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes,
   1734                            l2_slice, m, false);
   1735    if (ret < 0) {
   1736        goto out;
   1737    }
   1738
   1739    ret = 1;
   1740
   1741out:
   1742    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
   1743    return ret;
   1744}
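
        /*
         * Worked example for the byte accounting in handle_alloc() (added for
         * illustration, not part of the original file), assuming 64 KiB
         * clusters: with offset_into_cluster(s, guest_offset) = 0x1800,
         * *bytes = 0x30000 and nb_clusters = 3, we get
         * requested_bytes = 0x31800 and avail_bytes = 0x30000, so
         * nb_bytes = 0x30000. The request is therefore shortened to
         * *bytes = 0x30000 - 0x1800 = 0x2e800, and *host_offset points 0x1800
         * bytes into the first newly allocated cluster.
         */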
   1745
   1746/*
   1747 * For a given area on the virtual disk defined by @offset and @bytes,
   1748 * find the corresponding area on the qcow2 image, allocating new
   1749 * clusters (or subclusters) if necessary. The result can span a
   1750 * combination of allocated and previously unallocated clusters.
   1751 *
   1752 * Note that offset may not be cluster aligned. In this case, the returned
    1753 * *host_offset points to the exact byte referenced by offset and therefore
    1754 * isn't cluster aligned either.
   1755 *
   1756 * On return, @host_offset is set to the beginning of the requested
   1757 * area. This area is guaranteed to be contiguous on the qcow2 file
   1758 * but it can be smaller than initially requested. In this case @bytes
   1759 * is updated with the actual size.
   1760 *
   1761 * If any clusters or subclusters were allocated then @m contains a
   1762 * list with the information of all the affected regions. Note that
   1763 * this can happen regardless of whether this function succeeds or
   1764 * not. The caller is responsible for updating the L2 metadata of the
   1765 * allocated clusters (on success) or freeing them (on failure), and
   1766 * for clearing the contents of @m afterwards in both cases.
   1767 *
   1768 * If the request conflicts with another write request in flight, the coroutine
   1769 * is queued and will be reentered when the dependency has completed.
   1770 *
   1771 * Return 0 on success and -errno in error cases
   1772 */
   1773int qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
   1774                            unsigned int *bytes, uint64_t *host_offset,
   1775                            QCowL2Meta **m)
   1776{
   1777    BDRVQcow2State *s = bs->opaque;
   1778    uint64_t start, remaining;
   1779    uint64_t cluster_offset;
   1780    uint64_t cur_bytes;
   1781    int ret;
   1782
   1783    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);
   1784
   1785again:
   1786    start = offset;
   1787    remaining = *bytes;
   1788    cluster_offset = INV_OFFSET;
   1789    *host_offset = INV_OFFSET;
   1790    cur_bytes = 0;
   1791    *m = NULL;
   1792
   1793    while (true) {
   1794
   1795        if (*host_offset == INV_OFFSET && cluster_offset != INV_OFFSET) {
   1796            *host_offset = cluster_offset;
   1797        }
   1798
   1799        assert(remaining >= cur_bytes);
   1800
   1801        start           += cur_bytes;
   1802        remaining       -= cur_bytes;
   1803
   1804        if (cluster_offset != INV_OFFSET) {
   1805            cluster_offset += cur_bytes;
   1806        }
   1807
   1808        if (remaining == 0) {
   1809            break;
   1810        }
   1811
   1812        cur_bytes = remaining;
   1813
   1814        /*
   1815         * Now start gathering as many contiguous clusters as possible:
   1816         *
   1817         * 1. Check for overlaps with in-flight allocations
   1818         *
   1819         *      a) Overlap not in the first cluster -> shorten this request and
   1820         *         let the caller handle the rest in its next loop iteration.
   1821         *
   1822         *      b) Real overlaps of two requests. Yield and restart the search
   1823         *         for contiguous clusters (the situation could have changed
   1824         *         while we were sleeping)
   1825         *
   1826         *      c) TODO: Request starts in the same cluster as the in-flight
    1827         *         allocation ends. Shorten the COW of the in-flight allocation,
   1828         *         set cluster_offset to write to the same cluster and set up
   1829         *         the right synchronisation between the in-flight request and
   1830         *         the new one.
   1831         */
   1832        ret = handle_dependencies(bs, start, &cur_bytes, m);
   1833        if (ret == -EAGAIN) {
   1834            /* Currently handle_dependencies() doesn't yield if we already had
   1835             * an allocation. If it did, we would have to clean up the L2Meta
   1836             * structs before starting over. */
   1837            assert(*m == NULL);
   1838            goto again;
   1839        } else if (ret < 0) {
   1840            return ret;
   1841        } else if (cur_bytes == 0) {
   1842            break;
   1843        } else {
   1844            /* handle_dependencies() may have decreased cur_bytes (shortened
   1845             * the allocations below) so that the next dependency is processed
   1846             * correctly during the next loop iteration. */
   1847        }
   1848
   1849        /*
   1850         * 2. Count contiguous COPIED clusters.
   1851         */
   1852        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
   1853        if (ret < 0) {
   1854            return ret;
   1855        } else if (ret) {
   1856            continue;
   1857        } else if (cur_bytes == 0) {
   1858            break;
   1859        }
   1860
   1861        /*
   1862         * 3. If the request still hasn't completed, allocate new clusters,
   1863         *    considering any cluster_offset of steps 1c or 2.
   1864         */
   1865        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
   1866        if (ret < 0) {
   1867            return ret;
   1868        } else if (ret) {
   1869            continue;
   1870        } else {
   1871            assert(cur_bytes == 0);
   1872            break;
   1873        }
   1874    }
   1875
   1876    *bytes -= remaining;
   1877    assert(*bytes > 0);
   1878    assert(*host_offset != INV_OFFSET);
   1879    assert(offset_into_cluster(s, *host_offset) ==
   1880           offset_into_cluster(s, offset));
   1881
   1882    return 0;
   1883}
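
        /*
         * Minimal caller-side sketch (added for illustration, not part of the
         * original file): once qcow2_alloc_host_offset() has succeeded and the
         * guest data has been written to *host_offset, every QCowL2Meta
         * gathered in @m must be committed and taken off the in-flight list.
         * This mirrors the pattern used by the write path in qcow2.c, but it
         * is an example only, is never called, and omits the COW of the
         * cow_start/cow_end areas as well as proper error recovery (see
         * qcow2_alloc_cluster_abort()).
         */
        static coroutine_fn int example_commit_l2meta(BlockDriverState *bs,
                                                      QCowL2Meta *l2meta)
        {
            int ret;

            while (l2meta != NULL) {
                QCowL2Meta *next = l2meta->next;

                /* Point the L2 tables at the newly allocated clusters */
                ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
                if (ret < 0) {
                    /* A real caller would abort the remaining allocations here */
                    return ret;
                }

                /* Take the request off the list of in-flight allocations */
                QLIST_REMOVE(l2meta, next_in_flight);

                /* Wake up any requests that were waiting on this allocation */
                qemu_co_queue_restart_all(&l2meta->dependent_requests);

                g_free(l2meta);
                l2meta = next;
            }

            return 0;
        }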
   1884
   1885/*
    1886 * This discards as many of the nb_clusters as possible at once (i.e. all
    1887 * clusters in the same L2 slice) and returns the number of discarded
   1888 * clusters.
   1889 */
   1890static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
   1891                               uint64_t nb_clusters,
   1892                               enum qcow2_discard_type type, bool full_discard)
   1893{
   1894    BDRVQcow2State *s = bs->opaque;
   1895    uint64_t *l2_slice;
   1896    int l2_index;
   1897    int ret;
   1898    int i;
   1899
   1900    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
   1901    if (ret < 0) {
   1902        return ret;
   1903    }
   1904
   1905    /* Limit nb_clusters to one L2 slice */
   1906    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
   1907    assert(nb_clusters <= INT_MAX);
   1908
   1909    for (i = 0; i < nb_clusters; i++) {
   1910        uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
   1911        uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
   1912        uint64_t new_l2_entry = old_l2_entry;
   1913        uint64_t new_l2_bitmap = old_l2_bitmap;
   1914        QCow2ClusterType cluster_type =
   1915            qcow2_get_cluster_type(bs, old_l2_entry);
   1916
   1917        /*
   1918         * If full_discard is true, the cluster should not read back as zeroes,
   1919         * but rather fall through to the backing file.
   1920         *
   1921         * If full_discard is false, make sure that a discarded area reads back
   1922         * as zeroes for v3 images (we cannot do it for v2 without actually
   1923         * writing a zero-filled buffer). We can skip the operation if the
   1924         * cluster is already marked as zero, or if it's unallocated and we
   1925         * don't have a backing file.
   1926         *
   1927         * TODO We might want to use bdrv_block_status(bs) here, but we're
   1928         * holding s->lock, so that doesn't work today.
   1929         */
   1930        if (full_discard) {
   1931            new_l2_entry = new_l2_bitmap = 0;
   1932        } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
   1933            if (has_subclusters(s)) {
   1934                new_l2_entry = 0;
   1935                new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
   1936            } else {
   1937                new_l2_entry = s->qcow_version >= 3 ? QCOW_OFLAG_ZERO : 0;
   1938            }
   1939        }
   1940
   1941        if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
   1942            continue;
   1943        }
   1944
   1945        /* First remove L2 entries */
   1946        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
   1947        set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
   1948        if (has_subclusters(s)) {
   1949            set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
   1950        }
   1951        /* Then decrease the refcount */
   1952        qcow2_free_any_cluster(bs, old_l2_entry, type);
   1953    }
   1954
   1955    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
   1956
   1957    return nb_clusters;
   1958}
   1959
   1960int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
   1961                          uint64_t bytes, enum qcow2_discard_type type,
   1962                          bool full_discard)
   1963{
   1964    BDRVQcow2State *s = bs->opaque;
   1965    uint64_t end_offset = offset + bytes;
   1966    uint64_t nb_clusters;
   1967    int64_t cleared;
   1968    int ret;
   1969
   1970    /* Caller must pass aligned values, except at image end */
   1971    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
   1972    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
   1973           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
   1974
   1975    nb_clusters = size_to_clusters(s, bytes);
   1976
   1977    s->cache_discards = true;
   1978
   1979    /* Each L2 slice is handled by its own loop iteration */
   1980    while (nb_clusters > 0) {
   1981        cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
   1982                                      full_discard);
   1983        if (cleared < 0) {
   1984            ret = cleared;
   1985            goto fail;
   1986        }
   1987
   1988        nb_clusters -= cleared;
   1989        offset += (cleared * s->cluster_size);
   1990    }
   1991
   1992    ret = 0;
   1993fail:
   1994    s->cache_discards = false;
   1995    qcow2_process_discards(bs, ret);
   1996
   1997    return ret;
   1998}
   1999
   2000/*
    2001 * This zeroes as many of the nb_clusters as possible at once (i.e. all
    2002 * clusters in the same L2 slice) and returns the number of zeroed
   2003 * clusters.
   2004 */
   2005static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
   2006                            uint64_t nb_clusters, int flags)
   2007{
   2008    BDRVQcow2State *s = bs->opaque;
   2009    uint64_t *l2_slice;
   2010    int l2_index;
   2011    int ret;
   2012    int i;
   2013
   2014    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
   2015    if (ret < 0) {
   2016        return ret;
   2017    }
   2018
   2019    /* Limit nb_clusters to one L2 slice */
   2020    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
   2021    assert(nb_clusters <= INT_MAX);
   2022
   2023    for (i = 0; i < nb_clusters; i++) {
   2024        uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
   2025        uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
   2026        QCow2ClusterType type = qcow2_get_cluster_type(bs, old_l2_entry);
   2027        bool unmap = (type == QCOW2_CLUSTER_COMPRESSED) ||
   2028            ((flags & BDRV_REQ_MAY_UNMAP) && qcow2_cluster_is_allocated(type));
   2029        uint64_t new_l2_entry = unmap ? 0 : old_l2_entry;
   2030        uint64_t new_l2_bitmap = old_l2_bitmap;
   2031
   2032        if (has_subclusters(s)) {
   2033            new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
   2034        } else {
   2035            new_l2_entry |= QCOW_OFLAG_ZERO;
   2036        }
   2037
   2038        if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
   2039            continue;
   2040        }
   2041
   2042        /* First update L2 entries */
   2043        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
   2044        set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
   2045        if (has_subclusters(s)) {
   2046            set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
   2047        }
   2048
   2049        /* Then decrease the refcount */
   2050        if (unmap) {
   2051            qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
   2052        }
   2053    }
   2054
   2055    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
   2056
   2057    return nb_clusters;
   2058}
   2059
   2060static int zero_l2_subclusters(BlockDriverState *bs, uint64_t offset,
   2061                               unsigned nb_subclusters)
   2062{
   2063    BDRVQcow2State *s = bs->opaque;
   2064    uint64_t *l2_slice;
   2065    uint64_t old_l2_bitmap, l2_bitmap;
   2066    int l2_index, ret, sc = offset_to_sc_index(s, offset);
   2067
   2068    /* For full clusters use zero_in_l2_slice() instead */
   2069    assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster);
   2070    assert(sc + nb_subclusters <= s->subclusters_per_cluster);
   2071    assert(offset_into_subcluster(s, offset) == 0);
   2072
   2073    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
   2074    if (ret < 0) {
   2075        return ret;
   2076    }
   2077
   2078    switch (qcow2_get_cluster_type(bs, get_l2_entry(s, l2_slice, l2_index))) {
   2079    case QCOW2_CLUSTER_COMPRESSED:
   2080        ret = -ENOTSUP; /* We cannot partially zeroize compressed clusters */
   2081        goto out;
   2082    case QCOW2_CLUSTER_NORMAL:
   2083    case QCOW2_CLUSTER_UNALLOCATED:
   2084        break;
   2085    default:
   2086        g_assert_not_reached();
   2087    }
   2088
   2089    old_l2_bitmap = l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
   2090
   2091    l2_bitmap |=  QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
   2092    l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
   2093
   2094    if (old_l2_bitmap != l2_bitmap) {
   2095        set_l2_bitmap(s, l2_slice, l2_index, l2_bitmap);
   2096        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
   2097    }
   2098
   2099    ret = 0;
   2100out:
   2101    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
   2102
   2103    return ret;
   2104}
   2105
   2106int qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
   2107                             uint64_t bytes, int flags)
   2108{
   2109    BDRVQcow2State *s = bs->opaque;
   2110    uint64_t end_offset = offset + bytes;
   2111    uint64_t nb_clusters;
   2112    unsigned head, tail;
   2113    int64_t cleared;
   2114    int ret;
   2115
   2116    /* If we have to stay in sync with an external data file, zero out
   2117     * s->data_file first. */
   2118    if (data_file_is_raw(bs)) {
   2119        assert(has_data_file(bs));
   2120        ret = bdrv_co_pwrite_zeroes(s->data_file, offset, bytes, flags);
   2121        if (ret < 0) {
   2122            return ret;
   2123        }
   2124    }
   2125
   2126    /* Caller must pass aligned values, except at image end */
   2127    assert(offset_into_subcluster(s, offset) == 0);
   2128    assert(offset_into_subcluster(s, end_offset) == 0 ||
   2129           end_offset >= bs->total_sectors << BDRV_SECTOR_BITS);
   2130
   2131    /*
   2132     * The zero flag is only supported by version 3 and newer. However, if we
   2133     * have no backing file, we can resort to discard in version 2.
   2134     */
   2135    if (s->qcow_version < 3) {
   2136        if (!bs->backing) {
   2137            return qcow2_cluster_discard(bs, offset, bytes,
   2138                                         QCOW2_DISCARD_REQUEST, false);
   2139        }
   2140        return -ENOTSUP;
   2141    }
   2142
   2143    head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
   2144    offset += head;
   2145
   2146    tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
   2147        end_offset - MAX(offset, start_of_cluster(s, end_offset));
   2148    end_offset -= tail;
   2149
   2150    s->cache_discards = true;
   2151
   2152    if (head) {
   2153        ret = zero_l2_subclusters(bs, offset - head,
   2154                                  size_to_subclusters(s, head));
   2155        if (ret < 0) {
   2156            goto fail;
   2157        }
   2158    }
   2159
   2160    /* Each L2 slice is handled by its own loop iteration */
   2161    nb_clusters = size_to_clusters(s, end_offset - offset);
   2162
   2163    while (nb_clusters > 0) {
   2164        cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags);
   2165        if (cleared < 0) {
   2166            ret = cleared;
   2167            goto fail;
   2168        }
   2169
   2170        nb_clusters -= cleared;
   2171        offset += (cleared * s->cluster_size);
   2172    }
   2173
   2174    if (tail) {
   2175        ret = zero_l2_subclusters(bs, end_offset, size_to_subclusters(s, tail));
   2176        if (ret < 0) {
   2177            goto fail;
   2178        }
   2179    }
   2180
   2181    ret = 0;
   2182fail:
   2183    s->cache_discards = false;
   2184    qcow2_process_discards(bs, ret);
   2185
   2186    return ret;
   2187}
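
        /*
         * Worked example for the head/tail split above (added for
         * illustration, not part of the original file), assuming 64 KiB
         * clusters with 2 KiB subclusters: zeroizing offset = 0x11000,
         * bytes = 0x23000 gives end_offset = 0x34000. The head is
         * 0x20000 - 0x11000 = 0xf000 bytes (30 subclusters, handled by
         * zero_l2_subclusters()), the tail is 0x34000 - 0x30000 = 0x4000 bytes
         * (8 subclusters), and the remaining 0x20000..0x30000 range is one
         * full cluster handled by zero_in_l2_slice().
         */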
   2188
   2189/*
   2190 * Expands all zero clusters in a specific L1 table (or deallocates them, for
   2191 * non-backed non-pre-allocated zero clusters).
   2192 *
   2193 * l1_entries and *visited_l1_entries are used to keep track of progress for
   2194 * status_cb(). l1_entries contains the total number of L1 entries and
   2195 * *visited_l1_entries counts all visited L1 entries.
   2196 */
   2197static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
   2198                                      int l1_size, int64_t *visited_l1_entries,
   2199                                      int64_t l1_entries,
   2200                                      BlockDriverAmendStatusCB *status_cb,
   2201                                      void *cb_opaque)
   2202{
   2203    BDRVQcow2State *s = bs->opaque;
   2204    bool is_active_l1 = (l1_table == s->l1_table);
   2205    uint64_t *l2_slice = NULL;
   2206    unsigned slice, slice_size2, n_slices;
   2207    int ret;
   2208    int i, j;
   2209
   2210    /* qcow2_downgrade() is not allowed in images with subclusters */
   2211    assert(!has_subclusters(s));
   2212
   2213    slice_size2 = s->l2_slice_size * l2_entry_size(s);
   2214    n_slices = s->cluster_size / slice_size2;
   2215
   2216    if (!is_active_l1) {
    2217        /* inactive L2 tables require a buffer in which to store them when
    2218         * loading them from disk */
   2219        l2_slice = qemu_try_blockalign(bs->file->bs, slice_size2);
   2220        if (l2_slice == NULL) {
   2221            return -ENOMEM;
   2222        }
   2223    }
   2224
   2225    for (i = 0; i < l1_size; i++) {
   2226        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
   2227        uint64_t l2_refcount;
   2228
   2229        if (!l2_offset) {
   2230            /* unallocated */
   2231            (*visited_l1_entries)++;
   2232            if (status_cb) {
   2233                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
   2234            }
   2235            continue;
   2236        }
   2237
   2238        if (offset_into_cluster(s, l2_offset)) {
   2239            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
   2240                                    PRIx64 " unaligned (L1 index: %#x)",
   2241                                    l2_offset, i);
   2242            ret = -EIO;
   2243            goto fail;
   2244        }
   2245
   2246        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
   2247                                 &l2_refcount);
   2248        if (ret < 0) {
   2249            goto fail;
   2250        }
   2251
   2252        for (slice = 0; slice < n_slices; slice++) {
   2253            uint64_t slice_offset = l2_offset + slice * slice_size2;
   2254            bool l2_dirty = false;
   2255            if (is_active_l1) {
   2256                /* get active L2 tables from cache */
   2257                ret = qcow2_cache_get(bs, s->l2_table_cache, slice_offset,
   2258                                      (void **)&l2_slice);
   2259            } else {
   2260                /* load inactive L2 tables from disk */
   2261                ret = bdrv_pread(bs->file, slice_offset, l2_slice, slice_size2);
   2262            }
   2263            if (ret < 0) {
   2264                goto fail;
   2265            }
   2266
   2267            for (j = 0; j < s->l2_slice_size; j++) {
   2268                uint64_t l2_entry = get_l2_entry(s, l2_slice, j);
   2269                int64_t offset = l2_entry & L2E_OFFSET_MASK;
   2270                QCow2ClusterType cluster_type =
   2271                    qcow2_get_cluster_type(bs, l2_entry);
   2272
   2273                if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
   2274                    cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
   2275                    continue;
   2276                }
   2277
   2278                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
   2279                    if (!bs->backing) {
   2280                        /*
   2281                         * not backed; therefore we can simply deallocate the
   2282                         * cluster. No need to call set_l2_bitmap(), this
   2283                         * function doesn't support images with subclusters.
   2284                         */
   2285                        set_l2_entry(s, l2_slice, j, 0);
   2286                        l2_dirty = true;
   2287                        continue;
   2288                    }
   2289
   2290                    offset = qcow2_alloc_clusters(bs, s->cluster_size);
   2291                    if (offset < 0) {
   2292                        ret = offset;
   2293                        goto fail;
   2294                    }
   2295
   2296                    /* The offset must fit in the offset field */
   2297                    assert((offset & L2E_OFFSET_MASK) == offset);
   2298
   2299                    if (l2_refcount > 1) {
   2300                        /* For shared L2 tables, set the refcount accordingly
   2301                         * (it is already 1 and needs to be l2_refcount) */
   2302                        ret = qcow2_update_cluster_refcount(
   2303                            bs, offset >> s->cluster_bits,
   2304                            refcount_diff(1, l2_refcount), false,
   2305                            QCOW2_DISCARD_OTHER);
   2306                        if (ret < 0) {
   2307                            qcow2_free_clusters(bs, offset, s->cluster_size,
   2308                                                QCOW2_DISCARD_OTHER);
   2309                            goto fail;
   2310                        }
   2311                    }
   2312                }
   2313
   2314                if (offset_into_cluster(s, offset)) {
   2315                    int l2_index = slice * s->l2_slice_size + j;
   2316                    qcow2_signal_corruption(
   2317                        bs, true, -1, -1,
   2318                        "Cluster allocation offset "
   2319                        "%#" PRIx64 " unaligned (L2 offset: %#"
   2320                        PRIx64 ", L2 index: %#x)", offset,
   2321                        l2_offset, l2_index);
   2322                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
   2323                        qcow2_free_clusters(bs, offset, s->cluster_size,
   2324                                            QCOW2_DISCARD_ALWAYS);
   2325                    }
   2326                    ret = -EIO;
   2327                    goto fail;
   2328                }
   2329
   2330                ret = qcow2_pre_write_overlap_check(bs, 0, offset,
   2331                                                    s->cluster_size, true);
   2332                if (ret < 0) {
   2333                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
   2334                        qcow2_free_clusters(bs, offset, s->cluster_size,
   2335                                            QCOW2_DISCARD_ALWAYS);
   2336                    }
   2337                    goto fail;
   2338                }
   2339
   2340                ret = bdrv_pwrite_zeroes(s->data_file, offset,
   2341                                         s->cluster_size, 0);
   2342                if (ret < 0) {
   2343                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
   2344                        qcow2_free_clusters(bs, offset, s->cluster_size,
   2345                                            QCOW2_DISCARD_ALWAYS);
   2346                    }
   2347                    goto fail;
   2348                }
   2349
   2350                if (l2_refcount == 1) {
   2351                    set_l2_entry(s, l2_slice, j, offset | QCOW_OFLAG_COPIED);
   2352                } else {
   2353                    set_l2_entry(s, l2_slice, j, offset);
   2354                }
   2355                /*
   2356                 * No need to call set_l2_bitmap() after set_l2_entry() because
   2357                 * this function doesn't support images with subclusters.
   2358                 */
   2359                l2_dirty = true;
   2360            }
   2361
   2362            if (is_active_l1) {
   2363                if (l2_dirty) {
   2364                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
   2365                    qcow2_cache_depends_on_flush(s->l2_table_cache);
   2366                }
   2367                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
   2368            } else {
   2369                if (l2_dirty) {
   2370                    ret = qcow2_pre_write_overlap_check(
   2371                        bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2,
   2372                        slice_offset, slice_size2, false);
   2373                    if (ret < 0) {
   2374                        goto fail;
   2375                    }
   2376
   2377                    ret = bdrv_pwrite(bs->file, slice_offset,
   2378                                      l2_slice, slice_size2);
   2379                    if (ret < 0) {
   2380                        goto fail;
   2381                    }
   2382                }
   2383            }
   2384        }
   2385
   2386        (*visited_l1_entries)++;
   2387        if (status_cb) {
   2388            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
   2389        }
   2390    }
   2391
   2392    ret = 0;
   2393
   2394fail:
   2395    if (l2_slice) {
   2396        if (!is_active_l1) {
   2397            qemu_vfree(l2_slice);
   2398        } else {
   2399            qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
   2400        }
   2401    }
   2402    return ret;
   2403}
   2404
   2405/*
   2406 * For backed images, expands all zero clusters on the image. For non-backed
   2407 * images, deallocates all non-pre-allocated zero clusters (and claims the
   2408 * allocation for pre-allocated ones). This is important for downgrading to a
   2409 * qcow2 version which doesn't yet support metadata zero clusters.
   2410 */
   2411int qcow2_expand_zero_clusters(BlockDriverState *bs,
   2412                               BlockDriverAmendStatusCB *status_cb,
   2413                               void *cb_opaque)
   2414{
   2415    BDRVQcow2State *s = bs->opaque;
   2416    uint64_t *l1_table = NULL;
   2417    int64_t l1_entries = 0, visited_l1_entries = 0;
   2418    int ret;
   2419    int i, j;
   2420
   2421    if (status_cb) {
   2422        l1_entries = s->l1_size;
   2423        for (i = 0; i < s->nb_snapshots; i++) {
   2424            l1_entries += s->snapshots[i].l1_size;
   2425        }
   2426    }
   2427
   2428    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
   2429                                     &visited_l1_entries, l1_entries,
   2430                                     status_cb, cb_opaque);
   2431    if (ret < 0) {
   2432        goto fail;
   2433    }
   2434
   2435    /* Inactive L1 tables may point to active L2 tables - therefore it is
   2436     * necessary to flush the L2 table cache before trying to access the L2
   2437     * tables pointed to by inactive L1 entries (else we might try to expand
   2438     * zero clusters that have already been expanded); furthermore, it is also
   2439     * necessary to empty the L2 table cache, since it may contain tables which
   2440     * are now going to be modified directly on disk, bypassing the cache.
   2441     * qcow2_cache_empty() does both for us. */
   2442    ret = qcow2_cache_empty(bs, s->l2_table_cache);
   2443    if (ret < 0) {
   2444        goto fail;
   2445    }
   2446
   2447    for (i = 0; i < s->nb_snapshots; i++) {
   2448        int l1_size2;
   2449        uint64_t *new_l1_table;
   2450        Error *local_err = NULL;
   2451
   2452        ret = qcow2_validate_table(bs, s->snapshots[i].l1_table_offset,
   2453                                   s->snapshots[i].l1_size, L1E_SIZE,
   2454                                   QCOW_MAX_L1_SIZE, "Snapshot L1 table",
   2455                                   &local_err);
   2456        if (ret < 0) {
   2457            error_report_err(local_err);
   2458            goto fail;
   2459        }
   2460
   2461        l1_size2 = s->snapshots[i].l1_size * L1E_SIZE;
   2462        new_l1_table = g_try_realloc(l1_table, l1_size2);
   2463
   2464        if (!new_l1_table) {
   2465            ret = -ENOMEM;
   2466            goto fail;
   2467        }
   2468
   2469        l1_table = new_l1_table;
   2470
   2471        ret = bdrv_pread(bs->file, s->snapshots[i].l1_table_offset,
   2472                         l1_table, l1_size2);
   2473        if (ret < 0) {
   2474            goto fail;
   2475        }
   2476
   2477        for (j = 0; j < s->snapshots[i].l1_size; j++) {
   2478            be64_to_cpus(&l1_table[j]);
   2479        }
   2480
   2481        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
   2482                                         &visited_l1_entries, l1_entries,
   2483                                         status_cb, cb_opaque);
   2484        if (ret < 0) {
   2485            goto fail;
   2486        }
   2487    }
   2488
   2489    ret = 0;
   2490
   2491fail:
   2492    g_free(l1_table);
   2493    return ret;
   2494}
   2495
   2496void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry,
   2497                                     uint64_t *coffset, int *csize)
   2498{
   2499    BDRVQcow2State *s = bs->opaque;
   2500    int nb_csectors;
   2501
   2502    assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED);
   2503
   2504    *coffset = l2_entry & s->cluster_offset_mask;
   2505
   2506    nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1;
   2507    *csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE -
   2508        (*coffset & (QCOW2_COMPRESSED_SECTOR_SIZE - 1));
   2509}
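
        /*
         * Worked example (added for illustration, not part of the original
         * file), assuming 64 KiB clusters: there csize_shift is 54, so bits
         * 0..53 of the L2 entry hold the host offset of the compressed data
         * and the 8 bits above it hold the size field. For an entry whose size
         * field is 3, nb_csectors is 4 and csize covers 4 * 512 bytes minus
         * the offset of *coffset within its first 512-byte sector.
         */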