cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

block.c (237469B)


      1/*
      2 * QEMU System Emulator block driver
      3 *
      4 * Copyright (c) 2003 Fabrice Bellard
      5 * Copyright (c) 2020 Virtuozzo International GmbH.
      6 *
      7 * Permission is hereby granted, free of charge, to any person obtaining a copy
      8 * of this software and associated documentation files (the "Software"), to deal
      9 * in the Software without restriction, including without limitation the rights
     10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     11 * copies of the Software, and to permit persons to whom the Software is
     12 * furnished to do so, subject to the following conditions:
     13 *
     14 * The above copyright notice and this permission notice shall be included in
     15 * all copies or substantial portions of the Software.
     16 *
     17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     23 * THE SOFTWARE.
     24 */
     25
     26#include "qemu/osdep.h"
     27#include "block/trace.h"
     28#include "block/block_int.h"
     29#include "block/blockjob.h"
     30#include "block/fuse.h"
     31#include "block/nbd.h"
     32#include "block/qdict.h"
     33#include "qemu/error-report.h"
     34#include "block/module_block.h"
     35#include "qemu/main-loop.h"
     36#include "qemu/module.h"
     37#include "qapi/error.h"
     38#include "qapi/qmp/qdict.h"
     39#include "qapi/qmp/qjson.h"
     40#include "qapi/qmp/qnull.h"
     41#include "qapi/qmp/qstring.h"
     42#include "qapi/qobject-output-visitor.h"
     43#include "qapi/qapi-visit-block-core.h"
     44#include "sysemu/block-backend.h"
     45#include "qemu/notify.h"
     46#include "qemu/option.h"
     47#include "qemu/coroutine.h"
     48#include "block/qapi.h"
     49#include "qemu/timer.h"
     50#include "qemu/cutils.h"
     51#include "qemu/id.h"
     52#include "qemu/range.h"
     53#include "qemu/rcu.h"
     54#include "block/coroutines.h"
     55
     56#ifdef CONFIG_BSD
     57#include <sys/ioctl.h>
     58#include <sys/queue.h>
     59#if defined(HAVE_SYS_DISK_H)
     60#include <sys/disk.h>
     61#endif
     62#endif
     63
     64#ifdef _WIN32
     65#include <windows.h>
     66#endif
     67
     68#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
     69
     /*
      * List of BlockDriverState nodes; it is not used in this part of the
      * file. NOTE(review): presumably the nodes that are part of the block
      * graph -- confirm against the code that inserts into it.
      */
     70static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
     71    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
     72
     /* Every BlockDriverState created by bdrv_new() is appended to this list. */
     73static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
     74    QTAILQ_HEAD_INITIALIZER(all_bdrv_states);
     75
     /*
      * Registered block drivers. bdrv_register() inserts at the head; the
      * format, protocol and host-device lookups below walk this list.
      */
     76static QLIST_HEAD(, BlockDriver) bdrv_drivers =
     77    QLIST_HEAD_INITIALIZER(bdrv_drivers);
     78
     /* Forward declarations of static helpers used before their definition. */
     79static BlockDriverState *bdrv_open_inherit(const char *filename,
     80                                           const char *reference,
     81                                           QDict *options, int flags,
     82                                           BlockDriverState *parent,
     83                                           const BdrvChildClass *child_class,
     84                                           BdrvChildRole child_role,
     85                                           Error **errp);
     86
     87static void bdrv_replace_child_noperm(BdrvChild *child,
     88                                      BlockDriverState *new_bs);
     89static void bdrv_remove_file_or_backing_child(BlockDriverState *bs,
     90                                              BdrvChild *child,
     91                                              Transaction *tran);
     92static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
     93                                            Transaction *tran);
     94
     95static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
     96                               BlockReopenQueue *queue,
     97                               Transaction *change_child_tran, Error **errp);
     98static void bdrv_reopen_commit(BDRVReopenState *reopen_state);
     99static void bdrv_reopen_abort(BDRVReopenState *reopen_state);
    100
    101/* If non-zero, use only whitelisted block drivers */
    /* The value is reported to callers through bdrv_uses_whitelist(). */
    102static int use_bdrv_whitelist;
    103
    104#ifdef _WIN32
    /*
     * Return non-zero if @filename starts with an ASCII drive letter
     * immediately followed by a colon (for example "c:" or "D:\dir").
     */
    static int is_windows_drive_prefix(const char *filename)
    {
        const char letter = filename[0];
        const int is_lower = letter >= 'a' && letter <= 'z';
        const int is_upper = letter >= 'A' && letter <= 'Z';

        /* Do not look at the second character unless the first is a letter. */
        if (!is_lower && !is_upper) {
            return 0;
        }

        return filename[1] == ':';
    }
    111
    /*
     * Return 1 if @filename is exactly a drive specification ("d:") or
     * starts with one of the prefixes "\\.\" or "//./"; return 0 otherwise.
     */
    int is_windows_drive(const char *filename)
    {
        /* A bare drive letter: prefix matches and nothing follows the colon */
        if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
            return 1;
        }

        return (strstart(filename, "\\\\.\\", NULL) ||
                strstart(filename, "//./", NULL)) ? 1 : 0;
    }
    122#endif
    123
    124size_t bdrv_opt_mem_align(BlockDriverState *bs)
    125{
    126    if (!bs || !bs->drv) {
    127        /* page size or 4k (hdd sector size) should be on the safe side */
    128        return MAX(4096, qemu_real_host_page_size);
    129    }
    130
    131    return bs->bl.opt_mem_alignment;
    132}
    133
    134size_t bdrv_min_mem_align(BlockDriverState *bs)
    135{
    136    if (!bs || !bs->drv) {
    137        /* page size or 4k (hdd sector size) should be on the safe side */
    138        return MAX(4096, qemu_real_host_page_size);
    139    }
    140
    141    return bs->bl.min_mem_alignment;
    142}
    143
    /*
     * Check whether @path starts with "<protocol>:", i.e. whether a colon
     * occurs before the first path separator. On Windows, drive
     * specifications are never treated as protocols.
     */
    int path_has_protocol(const char *path)
    {
    #ifdef _WIN32
        const char *stop_chars = ":/\\";

        if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
            return 0;
        }
    #else
        const char *stop_chars = ":/";
    #endif

        /* Index of the first colon or separator, or of the terminating NUL */
        return path[strcspn(path, stop_chars)] == ':';
    }
    161
    /*
     * Return non-zero if @path is absolute: it starts with '/', or, on
     * Windows, it starts with '\' or is/starts with a drive specification.
     */
    int path_is_absolute(const char *path)
    {
        const char first = path[0];

    #ifdef _WIN32
        /* specific case for names like: "\\.\d:" */
        if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
            return 1;
        }
        if (first == '\\') {
            return 1;
        }
    #endif

        return first == '/';
    }
    174
    /*
     * Build a path for @filename relative to @base_path.
     *
     * An absolute @filename is simply duplicated. Otherwise the result is
     * the leading part of @base_path -- up to and including its last path
     * separator, or its "<protocol>:" prefix if that ends later -- followed
     * by @filename. URLs are supported. The result is a newly allocated
     * string.
     */
    char *path_combine(const char *base_path, const char *filename)
    {
        const char *after_protocol = base_path;
        const char *after_dir = base_path;
        const char *last_sep;
        const char *prefix_end;
        char *combined;
        size_t name_len;
        int prefix_len;

        if (path_is_absolute(filename)) {
            return g_strdup(filename);
        }

        if (path_has_protocol(base_path)) {
            const char *colon = strchr(base_path, ':');

            if (colon) {
                after_protocol = colon + 1;
            }
        }

        last_sep = strrchr(base_path, '/');
    #ifdef _WIN32
        {
            const char *last_backslash = strrchr(base_path, '\\');

            if (!last_sep || last_backslash > last_sep) {
                last_sep = last_backslash;
            }
        }
    #endif
        if (last_sep) {
            after_dir = last_sep + 1;
        }

        /* Keep whichever prefix of base_path reaches further */
        prefix_end = after_dir > after_protocol ? after_dir : after_protocol;
        prefix_len = prefix_end - base_path;

        name_len = strlen(filename);
        combined = g_malloc(prefix_len + name_len + 1);
        memcpy(combined, base_path, prefix_len);
        memcpy(combined + prefix_len, filename, name_len + 1);

        return combined;
    }
    223
    224/*
    225 * Helper function for bdrv_parse_filename() implementations to remove optional
    226 * protocol prefixes (especially "file:") from a filename and for putting the
    227 * stripped filename into the options QDict if there is such a prefix.
    228 */
    229void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
    230                                      QDict *options)
    231{
    232    if (strstart(filename, prefix, &filename)) {
    233        /* Stripping the explicit protocol prefix may result in a protocol
    234         * prefix being (wrongly) detected (if the filename contains a colon) */
    235        if (path_has_protocol(filename)) {
    236            GString *fat_filename;
    237
    238            /* This means there is some colon before the first slash; therefore,
    239             * this cannot be an absolute path */
    240            assert(!path_is_absolute(filename));
    241
    242            /* And we can thus fix the protocol detection issue by prefixing it
    243             * by "./" */
    244            fat_filename = g_string_new("./");
    245            g_string_append(fat_filename, filename);
    246
    247            assert(!path_has_protocol(fat_filename->str));
    248
    249            qdict_put(options, "filename",
    250                      qstring_from_gstring(fat_filename));
    251        } else {
    252            /* If no protocol prefix was detected, we can use the shortened
    253             * filename as-is */
    254            qdict_put_str(options, "filename", filename);
    255        }
    256    }
    257}
    258
    259
    260/* Returns whether the image file is opened as read-only. Note that this can
    261 * return false and writing to the image file is still not possible because the
    262 * image is inactivated. */
    263bool bdrv_is_read_only(BlockDriverState *bs)
    264{
    265    return !(bs->open_flags & BDRV_O_RDWR);
    266}
    267
    268int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
    269                           bool ignore_allow_rdw, Error **errp)
    270{
    271    /* Do not set read_only if copy_on_read is enabled */
    272    if (bs->copy_on_read && read_only) {
    273        error_setg(errp, "Can't set node '%s' to r/o with copy-on-read enabled",
    274                   bdrv_get_device_or_node_name(bs));
    275        return -EINVAL;
    276    }
    277
    278    /* Do not clear read_only if it is prohibited */
    279    if (!read_only && !(bs->open_flags & BDRV_O_ALLOW_RDWR) &&
    280        !ignore_allow_rdw)
    281    {
    282        error_setg(errp, "Node '%s' is read only",
    283                   bdrv_get_device_or_node_name(bs));
    284        return -EPERM;
    285    }
    286
    287    return 0;
    288}
    289
    290/*
    291 * Called by a driver that can only provide a read-only image.
    292 *
    293 * Returns 0 if the node is already read-only or it could switch the node to
    294 * read-only because BDRV_O_AUTO_RDONLY is set.
    295 *
    296 * Returns -EACCES if the node is read-write and BDRV_O_AUTO_RDONLY is not set
    297 * or bdrv_can_set_read_only() forbids making the node read-only. If @errmsg
    298 * is not NULL, it is used as the error message for the Error object.
    299 */
    300int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
    301                              Error **errp)
    302{
    303    int ret = 0;
    304
    305    if (!(bs->open_flags & BDRV_O_RDWR)) {
    306        return 0;
    307    }
    308    if (!(bs->open_flags & BDRV_O_AUTO_RDONLY)) {
    309        goto fail;
    310    }
    311
    312    ret = bdrv_can_set_read_only(bs, true, false, NULL);
    313    if (ret < 0) {
    314        goto fail;
    315    }
    316
    317    bs->open_flags &= ~BDRV_O_RDWR;
    318
    319    return 0;
    320
    321fail:
    322    error_setg(errp, "%s", errmsg ?: "Image is read-only");
    323    return -EACCES;
    324}
    325
    326/*
    327 * If @backing is empty, this function returns NULL without setting
    328 * @errp.  In all other cases, NULL will only be returned with @errp
    329 * set.
    330 *
    331 * Therefore, a return value of NULL without @errp set means that
    332 * there is no backing file; if @errp is set, there is one but its
    333 * absolute filename cannot be generated.
    334 */
    335char *bdrv_get_full_backing_filename_from_filename(const char *backed,
    336                                                   const char *backing,
    337                                                   Error **errp)
    338{
    339    if (backing[0] == '\0') {
    340        return NULL;
    341    } else if (path_has_protocol(backing) || path_is_absolute(backing)) {
    342        return g_strdup(backing);
    343    } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
    344        error_setg(errp, "Cannot use relative backing file names for '%s'",
    345                   backed);
    346        return NULL;
    347    } else {
    348        return path_combine(backed, backing);
    349    }
    350}
    351
    352/*
    353 * If @filename is empty or NULL, this function returns NULL without
    354 * setting @errp.  In all other cases, NULL will only be returned with
    355 * @errp set.
    356 */
    357static char *bdrv_make_absolute_filename(BlockDriverState *relative_to,
    358                                         const char *filename, Error **errp)
    359{
    360    char *dir, *full_name;
    361
    362    if (!filename || filename[0] == '\0') {
    363        return NULL;
    364    } else if (path_has_protocol(filename) || path_is_absolute(filename)) {
    365        return g_strdup(filename);
    366    }
    367
    368    dir = bdrv_dirname(relative_to, errp);
    369    if (!dir) {
    370        return NULL;
    371    }
    372
    373    full_name = g_strconcat(dir, filename, NULL);
    374    g_free(dir);
    375    return full_name;
    376}
    377
    378char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp)
    379{
    380    return bdrv_make_absolute_filename(bs, bs->backing_file, errp);
    381}
    382
    383void bdrv_register(BlockDriver *bdrv)
    384{
    385    assert(bdrv->format_name);
    386    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
    387}
    388
    389BlockDriverState *bdrv_new(void)
    390{
    391    BlockDriverState *bs;
    392    int i;
    393
    394    bs = g_new0(BlockDriverState, 1);
    395    QLIST_INIT(&bs->dirty_bitmaps);
    396    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
    397        QLIST_INIT(&bs->op_blockers[i]);
    398    }
    399    qemu_co_mutex_init(&bs->reqs_lock);
    400    qemu_mutex_init(&bs->dirty_bitmap_mutex);
    401    bs->refcnt = 1;
    402    bs->aio_context = qemu_get_aio_context();
    403
    404    qemu_co_queue_init(&bs->flush_queue);
    405
    406    qemu_co_mutex_init(&bs->bsc_modify_lock);
    407    bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1);
    408
    409    for (i = 0; i < bdrv_drain_all_count; i++) {
    410        bdrv_drained_begin(bs);
    411    }
    412
    413    QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
    414
    415    return bs;
    416}
    417
    418static BlockDriver *bdrv_do_find_format(const char *format_name)
    419{
    420    BlockDriver *drv1;
    421
    422    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
    423        if (!strcmp(drv1->format_name, format_name)) {
    424            return drv1;
    425        }
    426    }
    427
    428    return NULL;
    429}
    430
    431BlockDriver *bdrv_find_format(const char *format_name)
    432{
    433    BlockDriver *drv1;
    434    int i;
    435
    436    drv1 = bdrv_do_find_format(format_name);
    437    if (drv1) {
    438        return drv1;
    439    }
    440
    441    /* The driver isn't registered, maybe we need to load a module */
    442    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
    443        if (!strcmp(block_driver_modules[i].format_name, format_name)) {
    444            block_module_load_one(block_driver_modules[i].library_name);
    445            break;
    446        }
    447    }
    448
    449    return bdrv_do_find_format(format_name);
    450}
    451
    452static int bdrv_format_is_whitelisted(const char *format_name, bool read_only)
    453{
    454    static const char *whitelist_rw[] = {
    455        CONFIG_BDRV_RW_WHITELIST
    456        NULL
    457    };
    458    static const char *whitelist_ro[] = {
    459        CONFIG_BDRV_RO_WHITELIST
    460        NULL
    461    };
    462    const char **p;
    463
    464    if (!whitelist_rw[0] && !whitelist_ro[0]) {
    465        return 1;               /* no whitelist, anything goes */
    466    }
    467
    468    for (p = whitelist_rw; *p; p++) {
    469        if (!strcmp(format_name, *p)) {
    470            return 1;
    471        }
    472    }
    473    if (read_only) {
    474        for (p = whitelist_ro; *p; p++) {
    475            if (!strcmp(format_name, *p)) {
    476                return 1;
    477            }
    478        }
    479    }
    480    return 0;
    481}
    482
    483int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
    484{
    485    return bdrv_format_is_whitelisted(drv->format_name, read_only);
    486}
    487
    488bool bdrv_uses_whitelist(void)
    489{
    490    return use_bdrv_whitelist;
    491}
    492
    /*
     * Arguments and results exchanged between bdrv_create() and the
     * bdrv_create_co_entry() coroutine entry point.
     */
    493typedef struct CreateCo {
    494    BlockDriver *drv;    /* driver whose bdrv_co_create_opts() is called */
    495    char *filename;      /* copy made by bdrv_create(), which also frees it */
    496    QemuOpts *opts;      /* creation options handed to the driver */
    497    int ret;             /* NOT_DONE until the entry point stores the result */
    498    Error *err;          /* error from the driver callback, if any */
    499} CreateCo;
    500
    501static void coroutine_fn bdrv_create_co_entry(void *opaque)
    502{
    503    Error *local_err = NULL;
    504    int ret;
    505
    506    CreateCo *cco = opaque;
    507    assert(cco->drv);
    508
    509    ret = cco->drv->bdrv_co_create_opts(cco->drv,
    510                                        cco->filename, cco->opts, &local_err);
    511    error_propagate(&cco->err, local_err);
    512    cco->ret = ret;
    513}
    514
    515int bdrv_create(BlockDriver *drv, const char* filename,
    516                QemuOpts *opts, Error **errp)
    517{
    518    int ret;
    519
    520    Coroutine *co;
    521    CreateCo cco = {
    522        .drv = drv,
    523        .filename = g_strdup(filename),
    524        .opts = opts,
    525        .ret = NOT_DONE,
    526        .err = NULL,
    527    };
    528
    529    if (!drv->bdrv_co_create_opts) {
    530        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
    531        ret = -ENOTSUP;
    532        goto out;
    533    }
    534
    535    if (qemu_in_coroutine()) {
    536        /* Fast-path if already in coroutine context */
    537        bdrv_create_co_entry(&cco);
    538    } else {
    539        co = qemu_coroutine_create(bdrv_create_co_entry, &cco);
    540        qemu_coroutine_enter(co);
    541        while (cco.ret == NOT_DONE) {
    542            aio_poll(qemu_get_aio_context(), true);
    543        }
    544    }
    545
    546    ret = cco.ret;
    547    if (ret < 0) {
    548        if (cco.err) {
    549            error_propagate(errp, cco.err);
    550        } else {
    551            error_setg_errno(errp, -ret, "Could not create image");
    552        }
    553    }
    554
    555out:
    556    g_free(cco.filename);
    557    return ret;
    558}
    559
    560/**
    561 * Helper function for bdrv_create_file_fallback(): Resize @blk to at
    562 * least the given @minimum_size.
    563 *
    564 * On success, return @blk's actual length.
    565 * Otherwise, return -errno.
    566 */
    567static int64_t create_file_fallback_truncate(BlockBackend *blk,
    568                                             int64_t minimum_size, Error **errp)
    569{
    570    Error *local_err = NULL;
    571    int64_t size;
    572    int ret;
    573
    574    ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
    575                       &local_err);
    576    if (ret < 0 && ret != -ENOTSUP) {
    577        error_propagate(errp, local_err);
    578        return ret;
    579    }
    580
    581    size = blk_getlength(blk);
    582    if (size < 0) {
    583        error_free(local_err);
    584        error_setg_errno(errp, -size,
    585                         "Failed to inquire the new image file's length");
    586        return size;
    587    }
    588
    589    if (size < minimum_size) {
    590        /* Need to grow the image, but we failed to do that */
    591        error_propagate(errp, local_err);
    592        return -ENOTSUP;
    593    }
    594
    595    error_free(local_err);
    596    local_err = NULL;
    597
    598    return size;
    599}
    600
    601/**
    602 * Helper function for bdrv_create_file_fallback(): Zero the first
    603 * sector to remove any potentially pre-existing image header.
    604 */
    605static int create_file_fallback_zero_first_sector(BlockBackend *blk,
    606                                                  int64_t current_size,
    607                                                  Error **errp)
    608{
    609    int64_t bytes_to_clear;
    610    int ret;
    611
    612    bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE);
    613    if (bytes_to_clear) {
    614        ret = blk_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP);
    615        if (ret < 0) {
    616            error_setg_errno(errp, -ret,
    617                             "Failed to clear the new image's first sector");
    618            return ret;
    619        }
    620    }
    621
    622    return 0;
    623}
    624
    625/**
    626 * Simple implementation of bdrv_co_create_opts for protocol drivers
    627 * which only support creation via opening a file
    628 * (usually existing raw storage device)
    629 */
    630int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
    631                                            const char *filename,
    632                                            QemuOpts *opts,
    633                                            Error **errp)
    634{
    635    BlockBackend *blk;
    636    QDict *options;
    637    int64_t size = 0;
    638    char *buf = NULL;
    639    PreallocMode prealloc;
    640    Error *local_err = NULL;
    641    int ret;
    642
    643    size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
    644    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    645    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
    646                               PREALLOC_MODE_OFF, &local_err);
    647    g_free(buf);
    648    if (local_err) {
    649        error_propagate(errp, local_err);
    650        return -EINVAL;
    651    }
    652
    653    if (prealloc != PREALLOC_MODE_OFF) {
    654        error_setg(errp, "Unsupported preallocation mode '%s'",
    655                   PreallocMode_str(prealloc));
    656        return -ENOTSUP;
    657    }
    658
    659    options = qdict_new();
    660    qdict_put_str(options, "driver", drv->format_name);
    661
    662    blk = blk_new_open(filename, NULL, options,
    663                       BDRV_O_RDWR | BDRV_O_RESIZE, errp);
    664    if (!blk) {
    665        error_prepend(errp, "Protocol driver '%s' does not support image "
    666                      "creation, and opening the image failed: ",
    667                      drv->format_name);
    668        return -EINVAL;
    669    }
    670
    671    size = create_file_fallback_truncate(blk, size, errp);
    672    if (size < 0) {
    673        ret = size;
    674        goto out;
    675    }
    676
    677    ret = create_file_fallback_zero_first_sector(blk, size, errp);
    678    if (ret < 0) {
    679        goto out;
    680    }
    681
    682    ret = 0;
    683out:
    684    blk_unref(blk);
    685    return ret;
    686}
    687
    688int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
    689{
    690    QemuOpts *protocol_opts;
    691    BlockDriver *drv;
    692    QDict *qdict;
    693    int ret;
    694
    695    drv = bdrv_find_protocol(filename, true, errp);
    696    if (drv == NULL) {
    697        return -ENOENT;
    698    }
    699
    700    if (!drv->create_opts) {
    701        error_setg(errp, "Driver '%s' does not support image creation",
    702                   drv->format_name);
    703        return -ENOTSUP;
    704    }
    705
    706    /*
    707     * 'opts' contains a QemuOptsList with a combination of format and protocol
    708     * default values.
    709     *
    710     * The format properly removes its options, but the default values remain
    711     * in 'opts->list'.  So if the protocol has options with the same name
    712     * (e.g. rbd has 'cluster_size' as qcow2), it will see the default values
    713     * of the format, since for overlapping options, the format wins.
    714     *
    715     * To avoid this issue, lets convert QemuOpts to QDict, in this way we take
    716     * only the set options, and then convert it back to QemuOpts, using the
    717     * create_opts of the protocol. So the new QemuOpts, will contain only the
    718     * protocol defaults.
    719     */
    720    qdict = qemu_opts_to_qdict(opts, NULL);
    721    protocol_opts = qemu_opts_from_qdict(drv->create_opts, qdict, errp);
    722    if (protocol_opts == NULL) {
    723        ret = -EINVAL;
    724        goto out;
    725    }
    726
    727    ret = bdrv_create(drv, filename, protocol_opts, errp);
    728out:
    729    qemu_opts_del(protocol_opts);
    730    qobject_unref(qdict);
    731    return ret;
    732}
    733
    734int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp)
    735{
    736    Error *local_err = NULL;
    737    int ret;
    738
    739    assert(bs != NULL);
    740
    741    if (!bs->drv) {
    742        error_setg(errp, "Block node '%s' is not opened", bs->filename);
    743        return -ENOMEDIUM;
    744    }
    745
    746    if (!bs->drv->bdrv_co_delete_file) {
    747        error_setg(errp, "Driver '%s' does not support image deletion",
    748                   bs->drv->format_name);
    749        return -ENOTSUP;
    750    }
    751
    752    ret = bs->drv->bdrv_co_delete_file(bs, &local_err);
    753    if (ret < 0) {
    754        error_propagate(errp, local_err);
    755    }
    756
    757    return ret;
    758}
    759
    760void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs)
    761{
    762    Error *local_err = NULL;
    763    int ret;
    764
    765    if (!bs) {
    766        return;
    767    }
    768
    769    ret = bdrv_co_delete_file(bs, &local_err);
    770    /*
    771     * ENOTSUP will happen if the block driver doesn't support
    772     * the 'bdrv_co_delete_file' interface. This is a predictable
    773     * scenario and shouldn't be reported back to the user.
    774     */
    775    if (ret == -ENOTSUP) {
    776        error_free(local_err);
    777    } else if (ret < 0) {
    778        error_report_err(local_err);
    779    }
    780}
    781
    782/**
    783 * Try to get @bs's logical and physical block size.
    784 * On success, store them in @bsz struct and return 0.
    785 * On failure return -errno.
    786 * @bs must not be empty.
    787 */
    788int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
    789{
    790    BlockDriver *drv = bs->drv;
    791    BlockDriverState *filtered = bdrv_filter_bs(bs);
    792
    793    if (drv && drv->bdrv_probe_blocksizes) {
    794        return drv->bdrv_probe_blocksizes(bs, bsz);
    795    } else if (filtered) {
    796        return bdrv_probe_blocksizes(filtered, bsz);
    797    }
    798
    799    return -ENOTSUP;
    800}
    801
    802/**
    803 * Try to get @bs's geometry (cyls, heads, sectors).
    804 * On success, store them in @geo struct and return 0.
    805 * On failure return -errno.
    806 * @bs must not be empty.
    807 */
    808int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
    809{
    810    BlockDriver *drv = bs->drv;
    811    BlockDriverState *filtered = bdrv_filter_bs(bs);
    812
    813    if (drv && drv->bdrv_probe_geometry) {
    814        return drv->bdrv_probe_geometry(bs, geo);
    815    } else if (filtered) {
    816        return bdrv_probe_geometry(filtered, geo);
    817    }
    818
    819    return -ENOTSUP;
    820}
    821
    /*
     * Create a uniquely-named empty temporary file.
     *
     * @filename receives the name of the created file; @size is the size of
     * that buffer in bytes. On POSIX hosts the file is created in $TMPDIR,
     * or in /var/tmp if TMPDIR is not set.
     *
     * Return 0 upon success, otherwise a negative errno value (-EOVERFLOW if
     * the name does not fit into @filename).
     */
    int get_tmp_filename(char *filename, int size)
    {
    #ifdef _WIN32
        char temp_dir[MAX_PATH];
        /* GetTempFileName requires that its output buffer (4th param)
           have length MAX_PATH or greater.  */
        assert(size >= MAX_PATH);
        return (GetTempPath(MAX_PATH, temp_dir)
                && GetTempFileName(temp_dir, "qem", 0, filename)
                ? 0 : -GetLastError());
    #else
        int fd;
        int saved_errno;
        const char *tmpdir;

        tmpdir = getenv("TMPDIR");
        if (!tmpdir) {
            tmpdir = "/var/tmp";
        }
        if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
            return -EOVERFLOW;
        }
        fd = mkstemp(filename);
        if (fd < 0) {
            return -errno;
        }
        if (close(fd) != 0) {
            /*
             * Remember why close() failed before cleaning up: unlink() may
             * itself modify errno and would otherwise hide the real cause.
             */
            saved_errno = errno;
            unlink(filename);
            return -saved_errno;
        }
        return 0;
    #endif
    }
    857
    858/*
    859 * Detect host devices. By convention, /dev/cdrom[N] is always
    860 * recognized as a host CDROM.
    861 */
    862static BlockDriver *find_hdev_driver(const char *filename)
    863{
    864    int score_max = 0, score;
    865    BlockDriver *drv = NULL, *d;
    866
    867    QLIST_FOREACH(d, &bdrv_drivers, list) {
    868        if (d->bdrv_probe_device) {
    869            score = d->bdrv_probe_device(filename);
    870            if (score > score_max) {
    871                score_max = score;
    872                drv = d;
    873            }
    874        }
    875    }
    876
    877    return drv;
    878}
    879
    880static BlockDriver *bdrv_do_find_protocol(const char *protocol)
    881{
    882    BlockDriver *drv1;
    883
    884    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
    885        if (drv1->protocol_name && !strcmp(drv1->protocol_name, protocol)) {
    886            return drv1;
    887        }
    888    }
    889
    890    return NULL;
    891}
    892
    893BlockDriver *bdrv_find_protocol(const char *filename,
    894                                bool allow_protocol_prefix,
    895                                Error **errp)
    896{
    897    BlockDriver *drv1;
    898    char protocol[128];
    899    int len;
    900    const char *p;
    901    int i;
    902
    903    /* TODO Drivers without bdrv_file_open must be specified explicitly */
    904
    905    /*
    906     * XXX(hch): we really should not let host device detection
    907     * override an explicit protocol specification, but moving this
    908     * later breaks access to device names with colons in them.
    909     * Thanks to the brain-dead persistent naming schemes on udev-
    910     * based Linux systems those actually are quite common.
    911     */
    912    drv1 = find_hdev_driver(filename);
    913    if (drv1) {
    914        return drv1;
    915    }
    916
    917    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
    918        return &bdrv_file;
    919    }
    920
    921    p = strchr(filename, ':');
    922    assert(p != NULL);
    923    len = p - filename;
    924    if (len > sizeof(protocol) - 1)
    925        len = sizeof(protocol) - 1;
    926    memcpy(protocol, filename, len);
    927    protocol[len] = '\0';
    928
    929    drv1 = bdrv_do_find_protocol(protocol);
    930    if (drv1) {
    931        return drv1;
    932    }
    933
    934    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
    935        if (block_driver_modules[i].protocol_name &&
    936            !strcmp(block_driver_modules[i].protocol_name, protocol)) {
    937            block_module_load_one(block_driver_modules[i].library_name);
    938            break;
    939        }
    940    }
    941
    942    drv1 = bdrv_do_find_protocol(protocol);
    943    if (!drv1) {
    944        error_setg(errp, "Unknown protocol '%s'", protocol);
    945    }
    946    return drv1;
    947}
    948
    949/*
    950 * Guess image format by probing its contents.
    951 * This is not a good idea when your image is raw (CVE-2008-2004), but
    952 * we do it anyway for backward compatibility.
    953 *
    954 * @buf         contains the image's first @buf_size bytes.
    955 * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
    956 *              but can be smaller if the image file is smaller)
    957 * @filename    is its filename.
    958 *
    959 * For all block drivers, call the bdrv_probe() method to get its
    960 * probing score.
    961 * Return the first block driver with the highest probing score.
    962 */
    963BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
    964                            const char *filename)
    965{
    966    int score_max = 0, score;
    967    BlockDriver *drv = NULL, *d;
    968
    969    QLIST_FOREACH(d, &bdrv_drivers, list) {
    970        if (d->bdrv_probe) {
    971            score = d->bdrv_probe(buf, buf_size, filename);
    972            if (score > score_max) {
    973                score_max = score;
    974                drv = d;
    975            }
    976        }
    977    }
    978
    979    return drv;
    980}
    981
    982static int find_image_format(BlockBackend *file, const char *filename,
    983                             BlockDriver **pdrv, Error **errp)
    984{
    985    BlockDriver *drv;
    986    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    987    int ret = 0;
    988
    989    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    990    if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) {
    991        *pdrv = &bdrv_raw;
    992        return ret;
    993    }
    994
    995    ret = blk_pread(file, 0, buf, sizeof(buf));
    996    if (ret < 0) {
    997        error_setg_errno(errp, -ret, "Could not read image for determining its "
    998                         "format");
    999        *pdrv = NULL;
   1000        return ret;
   1001    }
   1002
   1003    drv = bdrv_probe_all(buf, ret, filename);
   1004    if (!drv) {
   1005        error_setg(errp, "Could not determine image format: No compatible "
   1006                   "driver found");
   1007        ret = -ENOENT;
   1008    }
   1009    *pdrv = drv;
   1010    return ret;
   1011}
   1012
   1013/**
   1014 * Set the current 'total_sectors' value
   1015 * Return 0 on success, -errno on error.
   1016 */
   1017int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
   1018{
   1019    BlockDriver *drv = bs->drv;
   1020
   1021    if (!drv) {
   1022        return -ENOMEDIUM;
   1023    }
   1024
   1025    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
   1026    if (bdrv_is_sg(bs))
   1027        return 0;
   1028
   1029    /* query actual device if possible, otherwise just trust the hint */
   1030    if (drv->bdrv_getlength) {
   1031        int64_t length = drv->bdrv_getlength(bs);
   1032        if (length < 0) {
   1033            return length;
   1034        }
   1035        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
   1036    }
   1037
   1038    bs->total_sectors = hint;
   1039
   1040    if (bs->total_sectors * BDRV_SECTOR_SIZE > BDRV_MAX_LENGTH) {
   1041        return -EFBIG;
   1042    }
   1043
   1044    return 0;
   1045}
   1046
   1047/**
   1048 * Combines a QDict of new block driver @options with any missing options taken
   1049 * from @old_options, so that leaving out an option defaults to its old value.
   1050 */
   1051static void bdrv_join_options(BlockDriverState *bs, QDict *options,
   1052                              QDict *old_options)
   1053{
   1054    if (bs->drv && bs->drv->bdrv_join_options) {
   1055        bs->drv->bdrv_join_options(options, old_options);
   1056    } else {
   1057        qdict_join(options, old_options, false);
   1058    }
   1059}
   1060
   1061static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts,
   1062                                                            int open_flags,
   1063                                                            Error **errp)
   1064{
   1065    Error *local_err = NULL;
   1066    char *value = qemu_opt_get_del(opts, "detect-zeroes");
   1067    BlockdevDetectZeroesOptions detect_zeroes =
   1068        qapi_enum_parse(&BlockdevDetectZeroesOptions_lookup, value,
   1069                        BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, &local_err);
   1070    g_free(value);
   1071    if (local_err) {
   1072        error_propagate(errp, local_err);
   1073        return detect_zeroes;
   1074    }
   1075
   1076    if (detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
   1077        !(open_flags & BDRV_O_UNMAP))
   1078    {
   1079        error_setg(errp, "setting detect-zeroes to unmap is not allowed "
   1080                   "without setting discard operation to unmap");
   1081    }
   1082
   1083    return detect_zeroes;
   1084}
   1085
   1086/**
   1087 * Set open flags for aio engine
   1088 *
   1089 * Return 0 on success, -1 if the engine specified is invalid
   1090 */
   1091int bdrv_parse_aio(const char *mode, int *flags)
   1092{
   1093    if (!strcmp(mode, "threads")) {
   1094        /* do nothing, default */
   1095    } else if (!strcmp(mode, "native")) {
   1096        *flags |= BDRV_O_NATIVE_AIO;
   1097#ifdef CONFIG_LINUX_IO_URING
   1098    } else if (!strcmp(mode, "io_uring")) {
   1099        *flags |= BDRV_O_IO_URING;
   1100#endif
   1101    } else {
   1102        return -1;
   1103    }
   1104
   1105    return 0;
   1106}
   1107
   1108/**
   1109 * Set open flags for a given discard mode
   1110 *
   1111 * Return 0 on success, -1 if the discard mode was invalid.
   1112 */
   1113int bdrv_parse_discard_flags(const char *mode, int *flags)
   1114{
   1115    *flags &= ~BDRV_O_UNMAP;
   1116
   1117    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
   1118        /* do nothing */
   1119    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
   1120        *flags |= BDRV_O_UNMAP;
   1121    } else {
   1122        return -1;
   1123    }
   1124
   1125    return 0;
   1126}
   1127
   1128/**
   1129 * Set open flags for a given cache mode
   1130 *
   1131 * Return 0 on success, -1 if the cache mode was invalid.
   1132 */
   1133int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
   1134{
   1135    *flags &= ~BDRV_O_CACHE_MASK;
   1136
   1137    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
   1138        *writethrough = false;
   1139        *flags |= BDRV_O_NOCACHE;
   1140    } else if (!strcmp(mode, "directsync")) {
   1141        *writethrough = true;
   1142        *flags |= BDRV_O_NOCACHE;
   1143    } else if (!strcmp(mode, "writeback")) {
   1144        *writethrough = false;
   1145    } else if (!strcmp(mode, "unsafe")) {
   1146        *writethrough = false;
   1147        *flags |= BDRV_O_NO_FLUSH;
   1148    } else if (!strcmp(mode, "writethrough")) {
   1149        *writethrough = true;
   1150    } else {
   1151        return -1;
   1152    }
   1153
   1154    return 0;
   1155}
   1156
   1157static char *bdrv_child_get_parent_desc(BdrvChild *c)
   1158{
   1159    BlockDriverState *parent = c->opaque;
   1160    return g_strdup_printf("node '%s'", bdrv_get_node_name(parent));
   1161}
   1162
   1163static void bdrv_child_cb_drained_begin(BdrvChild *child)
   1164{
   1165    BlockDriverState *bs = child->opaque;
   1166    bdrv_do_drained_begin_quiesce(bs, NULL, false);
   1167}
   1168
   1169static bool bdrv_child_cb_drained_poll(BdrvChild *child)
   1170{
   1171    BlockDriverState *bs = child->opaque;
   1172    return bdrv_drain_poll(bs, false, NULL, false);
   1173}
   1174
   1175static void bdrv_child_cb_drained_end(BdrvChild *child,
   1176                                      int *drained_end_counter)
   1177{
   1178    BlockDriverState *bs = child->opaque;
   1179    bdrv_drained_end_no_poll(bs, drained_end_counter);
   1180}
   1181
   1182static int bdrv_child_cb_inactivate(BdrvChild *child)
   1183{
   1184    BlockDriverState *bs = child->opaque;
   1185    assert(bs->open_flags & BDRV_O_INACTIVE);
   1186    return 0;
   1187}
   1188
   1189static bool bdrv_child_cb_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
   1190                                          GSList **ignore, Error **errp)
   1191{
   1192    BlockDriverState *bs = child->opaque;
   1193    return bdrv_can_set_aio_context(bs, ctx, ignore, errp);
   1194}
   1195
   1196static void bdrv_child_cb_set_aio_ctx(BdrvChild *child, AioContext *ctx,
   1197                                      GSList **ignore)
   1198{
   1199    BlockDriverState *bs = child->opaque;
   1200    return bdrv_set_aio_context_ignore(bs, ctx, ignore);
   1201}
   1202
   1203/*
   1204 * Returns the options and flags that a temporary snapshot should get, based on
   1205 * the originally requested flags (the originally requested image will have
   1206 * flags like a backing file)
   1207 */
   1208static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
   1209                                       int parent_flags, QDict *parent_options)
   1210{
   1211    *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
   1212
   1213    /* For temporary files, unconditional cache=unsafe is fine */
   1214    qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
   1215    qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
   1216
   1217    /* Copy the read-only and discard options from the parent */
   1218    qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
   1219    qdict_copy_default(child_options, parent_options, BDRV_OPT_DISCARD);
   1220
   1221    /* aio=native doesn't work for cache.direct=off, so disable it for the
   1222     * temporary snapshot */
   1223    *child_flags &= ~BDRV_O_NATIVE_AIO;
   1224}
   1225
   1226static void bdrv_backing_attach(BdrvChild *c)
   1227{
   1228    BlockDriverState *parent = c->opaque;
   1229    BlockDriverState *backing_hd = c->bs;
   1230
   1231    assert(!parent->backing_blocker);
   1232    error_setg(&parent->backing_blocker,
   1233               "node is used as backing hd of '%s'",
   1234               bdrv_get_device_or_node_name(parent));
   1235
   1236    bdrv_refresh_filename(backing_hd);
   1237
   1238    parent->open_flags &= ~BDRV_O_NO_BACKING;
   1239
   1240    bdrv_op_block_all(backing_hd, parent->backing_blocker);
   1241    /* Otherwise we won't be able to commit or stream */
   1242    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
   1243                    parent->backing_blocker);
   1244    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
   1245                    parent->backing_blocker);
   1246    /*
   1247     * We do backup in 3 ways:
   1248     * 1. drive backup
   1249     *    The target bs is new opened, and the source is top BDS
   1250     * 2. blockdev backup
   1251     *    Both the source and the target are top BDSes.
   1252     * 3. internal backup(used for block replication)
   1253     *    Both the source and the target are backing file
   1254     *
   1255     * In case 1 and 2, neither the source nor the target is the backing file.
   1256     * In case 3, we will block the top BDS, so there is only one block job
   1257     * for the top BDS and its backing chain.
   1258     */
   1259    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
   1260                    parent->backing_blocker);
   1261    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
   1262                    parent->backing_blocker);
   1263}
   1264
   1265static void bdrv_backing_detach(BdrvChild *c)
   1266{
   1267    BlockDriverState *parent = c->opaque;
   1268
   1269    assert(parent->backing_blocker);
   1270    bdrv_op_unblock_all(c->bs, parent->backing_blocker);
   1271    error_free(parent->backing_blocker);
   1272    parent->backing_blocker = NULL;
   1273}
   1274
   1275static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
   1276                                        const char *filename, Error **errp)
   1277{
   1278    BlockDriverState *parent = c->opaque;
   1279    bool read_only = bdrv_is_read_only(parent);
   1280    int ret;
   1281
   1282    if (read_only) {
   1283        ret = bdrv_reopen_set_read_only(parent, false, errp);
   1284        if (ret < 0) {
   1285            return ret;
   1286        }
   1287    }
   1288
   1289    ret = bdrv_change_backing_file(parent, filename,
   1290                                   base->drv ? base->drv->format_name : "",
   1291                                   false);
   1292    if (ret < 0) {
   1293        error_setg_errno(errp, -ret, "Could not update backing file link");
   1294    }
   1295
   1296    if (read_only) {
   1297        bdrv_reopen_set_read_only(parent, true, NULL);
   1298    }
   1299
   1300    return ret;
   1301}
   1302
   1303/*
   1304 * Returns the options and flags that a generic child of a BDS should
   1305 * get, based on the given options and flags for the parent BDS.
   1306 */
   1307static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format,
   1308                                   int *child_flags, QDict *child_options,
   1309                                   int parent_flags, QDict *parent_options)
   1310{
   1311    int flags = parent_flags;
   1312
   1313    /*
   1314     * First, decide whether to set, clear, or leave BDRV_O_PROTOCOL.
   1315     * Generally, the question to answer is: Should this child be
   1316     * format-probed by default?
   1317     */
   1318
   1319    /*
   1320     * Pure and non-filtered data children of non-format nodes should
   1321     * be probed by default (even when the node itself has BDRV_O_PROTOCOL
   1322     * set).  This only affects a very limited set of drivers (namely
   1323     * quorum and blkverify when this comment was written).
   1324     * Force-clear BDRV_O_PROTOCOL then.
   1325     */
   1326    if (!parent_is_format &&
   1327        (role & BDRV_CHILD_DATA) &&
   1328        !(role & (BDRV_CHILD_METADATA | BDRV_CHILD_FILTERED)))
   1329    {
   1330        flags &= ~BDRV_O_PROTOCOL;
   1331    }
   1332
   1333    /*
   1334     * All children of format nodes (except for COW children) and all
   1335     * metadata children in general should never be format-probed.
   1336     * Force-set BDRV_O_PROTOCOL then.
   1337     */
   1338    if ((parent_is_format && !(role & BDRV_CHILD_COW)) ||
   1339        (role & BDRV_CHILD_METADATA))
   1340    {
   1341        flags |= BDRV_O_PROTOCOL;
   1342    }
   1343
   1344    /*
   1345     * If the cache mode isn't explicitly set, inherit direct and no-flush from
   1346     * the parent.
   1347     */
   1348    qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_DIRECT);
   1349    qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_NO_FLUSH);
   1350    qdict_copy_default(child_options, parent_options, BDRV_OPT_FORCE_SHARE);
   1351
   1352    if (role & BDRV_CHILD_COW) {
   1353        /* backing files are opened read-only by default */
   1354        qdict_set_default_str(child_options, BDRV_OPT_READ_ONLY, "on");
   1355        qdict_set_default_str(child_options, BDRV_OPT_AUTO_READ_ONLY, "off");
   1356    } else {
   1357        /* Inherit the read-only option from the parent if it's not set */
   1358        qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
   1359        qdict_copy_default(child_options, parent_options,
   1360                           BDRV_OPT_AUTO_READ_ONLY);
   1361    }
   1362
   1363    /*
   1364     * bdrv_co_pdiscard() respects unmap policy for the parent, so we
   1365     * can default to enable it on lower layers regardless of the
   1366     * parent option.
   1367     */
   1368    qdict_set_default_str(child_options, BDRV_OPT_DISCARD, "unmap");
   1369
   1370    /* Clear flags that only apply to the top layer */
   1371    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
   1372
   1373    if (role & BDRV_CHILD_METADATA) {
   1374        flags &= ~BDRV_O_NO_IO;
   1375    }
   1376    if (role & BDRV_CHILD_COW) {
   1377        flags &= ~BDRV_O_TEMPORARY;
   1378    }
   1379
   1380    *child_flags = flags;
   1381}
   1382
   1383static void bdrv_child_cb_attach(BdrvChild *child)
   1384{
   1385    BlockDriverState *bs = child->opaque;
   1386
   1387    if (child->role & BDRV_CHILD_COW) {
   1388        bdrv_backing_attach(child);
   1389    }
   1390
   1391    bdrv_apply_subtree_drain(child, bs);
   1392}
   1393
   1394static void bdrv_child_cb_detach(BdrvChild *child)
   1395{
   1396    BlockDriverState *bs = child->opaque;
   1397
   1398    if (child->role & BDRV_CHILD_COW) {
   1399        bdrv_backing_detach(child);
   1400    }
   1401
   1402    bdrv_unapply_subtree_drain(child, bs);
   1403}
   1404
   1405static int bdrv_child_cb_update_filename(BdrvChild *c, BlockDriverState *base,
   1406                                         const char *filename, Error **errp)
   1407{
   1408    if (c->role & BDRV_CHILD_COW) {
   1409        return bdrv_backing_update_filename(c, base, filename, errp);
   1410    }
   1411    return 0;
   1412}
   1413
   1414AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c)
   1415{
   1416    BlockDriverState *bs = c->opaque;
   1417
   1418    return bdrv_get_aio_context(bs);
   1419}
   1420
   1421const BdrvChildClass child_of_bds = {
   1422    .parent_is_bds   = true,
   1423    .get_parent_desc = bdrv_child_get_parent_desc,
   1424    .inherit_options = bdrv_inherited_options,
   1425    .drained_begin   = bdrv_child_cb_drained_begin,
   1426    .drained_poll    = bdrv_child_cb_drained_poll,
   1427    .drained_end     = bdrv_child_cb_drained_end,
   1428    .attach          = bdrv_child_cb_attach,
   1429    .detach          = bdrv_child_cb_detach,
   1430    .inactivate      = bdrv_child_cb_inactivate,
   1431    .can_set_aio_ctx = bdrv_child_cb_can_set_aio_ctx,
   1432    .set_aio_ctx     = bdrv_child_cb_set_aio_ctx,
   1433    .update_filename = bdrv_child_cb_update_filename,
   1434    .get_parent_aio_context = child_of_bds_get_parent_aio_context,
   1435};
   1436
   1437AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
   1438{
   1439    return c->klass->get_parent_aio_context(c);
   1440}
   1441
   1442static int bdrv_open_flags(BlockDriverState *bs, int flags)
   1443{
   1444    int open_flags = flags;
   1445
   1446    /*
   1447     * Clear flags that are internal to the block layer before opening the
   1448     * image.
   1449     */
   1450    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
   1451
   1452    return open_flags;
   1453}
   1454
   1455static void update_flags_from_options(int *flags, QemuOpts *opts)
   1456{
   1457    *flags &= ~(BDRV_O_CACHE_MASK | BDRV_O_RDWR | BDRV_O_AUTO_RDONLY);
   1458
   1459    if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_NO_FLUSH, false)) {
   1460        *flags |= BDRV_O_NO_FLUSH;
   1461    }
   1462
   1463    if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_DIRECT, false)) {
   1464        *flags |= BDRV_O_NOCACHE;
   1465    }
   1466
   1467    if (!qemu_opt_get_bool_del(opts, BDRV_OPT_READ_ONLY, false)) {
   1468        *flags |= BDRV_O_RDWR;
   1469    }
   1470
   1471    if (qemu_opt_get_bool_del(opts, BDRV_OPT_AUTO_READ_ONLY, false)) {
   1472        *flags |= BDRV_O_AUTO_RDONLY;
   1473    }
   1474}
   1475
   1476static void update_options_from_flags(QDict *options, int flags)
   1477{
   1478    if (!qdict_haskey(options, BDRV_OPT_CACHE_DIRECT)) {
   1479        qdict_put_bool(options, BDRV_OPT_CACHE_DIRECT, flags & BDRV_O_NOCACHE);
   1480    }
   1481    if (!qdict_haskey(options, BDRV_OPT_CACHE_NO_FLUSH)) {
   1482        qdict_put_bool(options, BDRV_OPT_CACHE_NO_FLUSH,
   1483                       flags & BDRV_O_NO_FLUSH);
   1484    }
   1485    if (!qdict_haskey(options, BDRV_OPT_READ_ONLY)) {
   1486        qdict_put_bool(options, BDRV_OPT_READ_ONLY, !(flags & BDRV_O_RDWR));
   1487    }
   1488    if (!qdict_haskey(options, BDRV_OPT_AUTO_READ_ONLY)) {
   1489        qdict_put_bool(options, BDRV_OPT_AUTO_READ_ONLY,
   1490                       flags & BDRV_O_AUTO_RDONLY);
   1491    }
   1492}
   1493
   1494static void bdrv_assign_node_name(BlockDriverState *bs,
   1495                                  const char *node_name,
   1496                                  Error **errp)
   1497{
   1498    char *gen_node_name = NULL;
   1499
   1500    if (!node_name) {
   1501        node_name = gen_node_name = id_generate(ID_BLOCK);
   1502    } else if (!id_wellformed(node_name)) {
   1503        /*
   1504         * Check for empty string or invalid characters, but not if it is
   1505         * generated (generated names use characters not available to the user)
   1506         */
   1507        error_setg(errp, "Invalid node-name: '%s'", node_name);
   1508        return;
   1509    }
   1510
   1511    /* takes care of avoiding namespaces collisions */
   1512    if (blk_by_name(node_name)) {
   1513        error_setg(errp, "node-name=%s is conflicting with a device id",
   1514                   node_name);
   1515        goto out;
   1516    }
   1517
   1518    /* takes care of avoiding duplicates node names */
   1519    if (bdrv_find_node(node_name)) {
   1520        error_setg(errp, "Duplicate nodes with node-name='%s'", node_name);
   1521        goto out;
   1522    }
   1523
   1524    /* Make sure that the node name isn't truncated */
   1525    if (strlen(node_name) >= sizeof(bs->node_name)) {
   1526        error_setg(errp, "Node name too long");
   1527        goto out;
   1528    }
   1529
   1530    /* copy node name into the bs and insert it into the graph list */
   1531    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
   1532    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
   1533out:
   1534    g_free(gen_node_name);
   1535}
   1536
   1537static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
   1538                            const char *node_name, QDict *options,
   1539                            int open_flags, Error **errp)
   1540{
   1541    Error *local_err = NULL;
   1542    int i, ret;
   1543
   1544    bdrv_assign_node_name(bs, node_name, &local_err);
   1545    if (local_err) {
   1546        error_propagate(errp, local_err);
   1547        return -EINVAL;
   1548    }
   1549
   1550    bs->drv = drv;
   1551    bs->opaque = g_malloc0(drv->instance_size);
   1552
   1553    if (drv->bdrv_file_open) {
   1554        assert(!drv->bdrv_needs_filename || bs->filename[0]);
   1555        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
   1556    } else if (drv->bdrv_open) {
   1557        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
   1558    } else {
   1559        ret = 0;
   1560    }
   1561
   1562    if (ret < 0) {
   1563        if (local_err) {
   1564            error_propagate(errp, local_err);
   1565        } else if (bs->filename[0]) {
   1566            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
   1567        } else {
   1568            error_setg_errno(errp, -ret, "Could not open image");
   1569        }
   1570        goto open_failed;
   1571    }
   1572
   1573    ret = refresh_total_sectors(bs, bs->total_sectors);
   1574    if (ret < 0) {
   1575        error_setg_errno(errp, -ret, "Could not refresh total sector count");
   1576        return ret;
   1577    }
   1578
   1579    bdrv_refresh_limits(bs, NULL, &local_err);
   1580    if (local_err) {
   1581        error_propagate(errp, local_err);
   1582        return -EINVAL;
   1583    }
   1584
   1585    assert(bdrv_opt_mem_align(bs) != 0);
   1586    assert(bdrv_min_mem_align(bs) != 0);
   1587    assert(is_power_of_2(bs->bl.request_alignment));
   1588
   1589    for (i = 0; i < bs->quiesce_counter; i++) {
   1590        if (drv->bdrv_co_drain_begin) {
   1591            drv->bdrv_co_drain_begin(bs);
   1592        }
   1593    }
   1594
   1595    return 0;
   1596open_failed:
   1597    bs->drv = NULL;
   1598    if (bs->file != NULL) {
   1599        bdrv_unref_child(bs, bs->file);
   1600        bs->file = NULL;
   1601    }
   1602    g_free(bs->opaque);
   1603    bs->opaque = NULL;
   1604    return ret;
   1605}
   1606
   1607/*
   1608 * Create and open a block node.
   1609 *
   1610 * @options is a QDict of options to pass to the block drivers, or NULL for an
   1611 * empty set of options. The reference to the QDict belongs to the block layer
   1612 * after the call (even on failure), so if the caller intends to reuse the
   1613 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
   1614 */
   1615BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv,
   1616                                            const char *node_name,
   1617                                            QDict *options, int flags,
   1618                                            Error **errp)
   1619{
   1620    BlockDriverState *bs;
   1621    int ret;
   1622
   1623    bs = bdrv_new();
   1624    bs->open_flags = flags;
   1625    bs->options = options ?: qdict_new();
   1626    bs->explicit_options = qdict_clone_shallow(bs->options);
   1627    bs->opaque = NULL;
   1628
   1629    update_options_from_flags(bs->options, flags);
   1630
   1631    ret = bdrv_open_driver(bs, drv, node_name, bs->options, flags, errp);
   1632    if (ret < 0) {
   1633        qobject_unref(bs->explicit_options);
   1634        bs->explicit_options = NULL;
   1635        qobject_unref(bs->options);
   1636        bs->options = NULL;
   1637        bdrv_unref(bs);
   1638        return NULL;
   1639    }
   1640
   1641    return bs;
   1642}
   1643
   1644/* Create and open a block node. */
   1645BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
   1646                                       int flags, Error **errp)
   1647{
   1648    return bdrv_new_open_driver_opts(drv, node_name, NULL, flags, errp);
   1649}
   1650
   1651QemuOptsList bdrv_runtime_opts = {
   1652    .name = "bdrv_common",
   1653    .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
   1654    .desc = {
   1655        {
   1656            .name = "node-name",
   1657            .type = QEMU_OPT_STRING,
   1658            .help = "Node name of the block device node",
   1659        },
   1660        {
   1661            .name = "driver",
   1662            .type = QEMU_OPT_STRING,
   1663            .help = "Block driver to use for the node",
   1664        },
   1665        {
   1666            .name = BDRV_OPT_CACHE_DIRECT,
   1667            .type = QEMU_OPT_BOOL,
   1668            .help = "Bypass software writeback cache on the host",
   1669        },
   1670        {
   1671            .name = BDRV_OPT_CACHE_NO_FLUSH,
   1672            .type = QEMU_OPT_BOOL,
   1673            .help = "Ignore flush requests",
   1674        },
   1675        {
   1676            .name = BDRV_OPT_READ_ONLY,
   1677            .type = QEMU_OPT_BOOL,
   1678            .help = "Node is opened in read-only mode",
   1679        },
   1680        {
   1681            .name = BDRV_OPT_AUTO_READ_ONLY,
   1682            .type = QEMU_OPT_BOOL,
   1683            .help = "Node can become read-only if opening read-write fails",
   1684        },
   1685        {
   1686            .name = "detect-zeroes",
   1687            .type = QEMU_OPT_STRING,
   1688            .help = "try to optimize zero writes (off, on, unmap)",
   1689        },
   1690        {
   1691            .name = BDRV_OPT_DISCARD,
   1692            .type = QEMU_OPT_STRING,
   1693            .help = "discard operation (ignore/off, unmap/on)",
   1694        },
   1695        {
   1696            .name = BDRV_OPT_FORCE_SHARE,
   1697            .type = QEMU_OPT_BOOL,
   1698            .help = "always accept other writers (default: off)",
   1699        },
   1700        { /* end of list */ }
   1701    },
   1702};
   1703
   1704QemuOptsList bdrv_create_opts_simple = {
   1705    .name = "simple-create-opts",
   1706    .head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head),
   1707    .desc = {
   1708        {
   1709            .name = BLOCK_OPT_SIZE,
   1710            .type = QEMU_OPT_SIZE,
   1711            .help = "Virtual disk size"
   1712        },
   1713        {
   1714            .name = BLOCK_OPT_PREALLOC,
   1715            .type = QEMU_OPT_STRING,
   1716            .help = "Preallocation mode (allowed values: off)"
   1717        },
   1718        { /* end of list */ }
   1719    }
   1720};
   1721
   1722/*
   1723 * Common part for opening disk images and files
   1724 *
   1725 * Removes all processed options from *options.
   1726 */
   1727static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
   1728                            QDict *options, Error **errp)
   1729{
   1730    int ret, open_flags;
   1731    const char *filename;
   1732    const char *driver_name = NULL;
   1733    const char *node_name = NULL;
   1734    const char *discard;
   1735    QemuOpts *opts;
   1736    BlockDriver *drv;
   1737    Error *local_err = NULL;
   1738    bool ro;
   1739
   1740    assert(bs->file == NULL);
   1741    assert(options != NULL && bs->options != options);
   1742
   1743    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
   1744    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
   1745        ret = -EINVAL;
   1746        goto fail_opts;
   1747    }
   1748
   1749    update_flags_from_options(&bs->open_flags, opts);
   1750
   1751    driver_name = qemu_opt_get(opts, "driver");
   1752    drv = bdrv_find_format(driver_name);
   1753    assert(drv != NULL);
   1754
   1755    bs->force_share = qemu_opt_get_bool(opts, BDRV_OPT_FORCE_SHARE, false);
   1756
   1757    if (bs->force_share && (bs->open_flags & BDRV_O_RDWR)) {
   1758        error_setg(errp,
   1759                   BDRV_OPT_FORCE_SHARE
   1760                   "=on can only be used with read-only images");
   1761        ret = -EINVAL;
   1762        goto fail_opts;
   1763    }
   1764
   1765    if (file != NULL) {
   1766        bdrv_refresh_filename(blk_bs(file));
   1767        filename = blk_bs(file)->filename;
   1768    } else {
   1769        /*
   1770         * Caution: while qdict_get_try_str() is fine, getting
   1771         * non-string types would require more care.  When @options
   1772         * come from -blockdev or blockdev_add, its members are typed
   1773         * according to the QAPI schema, but when they come from
   1774         * -drive, they're all QString.
   1775         */
   1776        filename = qdict_get_try_str(options, "filename");
   1777    }
   1778
   1779    if (drv->bdrv_needs_filename && (!filename || !filename[0])) {
   1780        error_setg(errp, "The '%s' block driver requires a file name",
   1781                   drv->format_name);
   1782        ret = -EINVAL;
   1783        goto fail_opts;
   1784    }
   1785
   1786    trace_bdrv_open_common(bs, filename ?: "", bs->open_flags,
   1787                           drv->format_name);
   1788
   1789    ro = bdrv_is_read_only(bs);
   1790
   1791    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, ro)) {
   1792        if (!ro && bdrv_is_whitelisted(drv, true)) {
   1793            ret = bdrv_apply_auto_read_only(bs, NULL, NULL);
   1794        } else {
   1795            ret = -ENOTSUP;
   1796        }
   1797        if (ret < 0) {
   1798            error_setg(errp,
   1799                       !ro && bdrv_is_whitelisted(drv, true)
   1800                       ? "Driver '%s' can only be used for read-only devices"
   1801                       : "Driver '%s' is not whitelisted",
   1802                       drv->format_name);
   1803            goto fail_opts;
   1804        }
   1805    }
   1806
   1807    /* bdrv_new() and bdrv_close() make it so */
   1808    assert(qatomic_read(&bs->copy_on_read) == 0);
   1809
   1810    if (bs->open_flags & BDRV_O_COPY_ON_READ) {
   1811        if (!ro) {
   1812            bdrv_enable_copy_on_read(bs);
   1813        } else {
   1814            error_setg(errp, "Can't use copy-on-read on read-only device");
   1815            ret = -EINVAL;
   1816            goto fail_opts;
   1817        }
   1818    }
   1819
   1820    discard = qemu_opt_get(opts, BDRV_OPT_DISCARD);
   1821    if (discard != NULL) {
   1822        if (bdrv_parse_discard_flags(discard, &bs->open_flags) != 0) {
   1823            error_setg(errp, "Invalid discard option");
   1824            ret = -EINVAL;
   1825            goto fail_opts;
   1826        }
   1827    }
   1828
   1829    bs->detect_zeroes =
   1830        bdrv_parse_detect_zeroes(opts, bs->open_flags, &local_err);
   1831    if (local_err) {
   1832        error_propagate(errp, local_err);
   1833        ret = -EINVAL;
   1834        goto fail_opts;
   1835    }
   1836
   1837    if (filename != NULL) {
   1838        pstrcpy(bs->filename, sizeof(bs->filename), filename);
   1839    } else {
   1840        bs->filename[0] = '\0';
   1841    }
   1842    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
   1843
   1844    /* Open the image, either directly or using a protocol */
   1845    open_flags = bdrv_open_flags(bs, bs->open_flags);
   1846    node_name = qemu_opt_get(opts, "node-name");
   1847
   1848    assert(!drv->bdrv_file_open || file == NULL);
   1849    ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
   1850    if (ret < 0) {
   1851        goto fail_opts;
   1852    }
   1853
   1854    qemu_opts_del(opts);
   1855    return 0;
   1856
   1857fail_opts:
   1858    qemu_opts_del(opts);
   1859    return ret;
   1860}
   1861
   1862static QDict *parse_json_filename(const char *filename, Error **errp)
   1863{
   1864    QObject *options_obj;
   1865    QDict *options;
   1866    int ret;
   1867
   1868    ret = strstart(filename, "json:", &filename);
   1869    assert(ret);
   1870
   1871    options_obj = qobject_from_json(filename, errp);
   1872    if (!options_obj) {
   1873        error_prepend(errp, "Could not parse the JSON options: ");
   1874        return NULL;
   1875    }
   1876
   1877    options = qobject_to(QDict, options_obj);
   1878    if (!options) {
   1879        qobject_unref(options_obj);
   1880        error_setg(errp, "Invalid JSON object given");
   1881        return NULL;
   1882    }
   1883
   1884    qdict_flatten(options);
   1885
   1886    return options;
   1887}
   1888
   1889static void parse_json_protocol(QDict *options, const char **pfilename,
   1890                                Error **errp)
   1891{
   1892    QDict *json_options;
   1893    Error *local_err = NULL;
   1894
   1895    /* Parse json: pseudo-protocol */
   1896    if (!*pfilename || !g_str_has_prefix(*pfilename, "json:")) {
   1897        return;
   1898    }
   1899
   1900    json_options = parse_json_filename(*pfilename, &local_err);
   1901    if (local_err) {
   1902        error_propagate(errp, local_err);
   1903        return;
   1904    }
   1905
   1906    /* Options given in the filename have lower priority than options
   1907     * specified directly */
   1908    qdict_join(options, json_options, false);
   1909    qobject_unref(json_options);
   1910    *pfilename = NULL;
   1911}
   1912
   1913/*
   1914 * Fills in default options for opening images and converts the legacy
   1915 * filename/flags pair to option QDict entries.
   1916 * The BDRV_O_PROTOCOL flag in *flags will be set or cleared accordingly if a
   1917 * block driver has been specified explicitly.
   1918 */
   1919static int bdrv_fill_options(QDict **options, const char *filename,
   1920                             int *flags, Error **errp)
   1921{
   1922    const char *drvname;
   1923    bool protocol = *flags & BDRV_O_PROTOCOL;
   1924    bool parse_filename = false;
   1925    BlockDriver *drv = NULL;
   1926    Error *local_err = NULL;
   1927
   1928    /*
   1929     * Caution: while qdict_get_try_str() is fine, getting non-string
   1930     * types would require more care.  When @options come from
   1931     * -blockdev or blockdev_add, its members are typed according to
   1932     * the QAPI schema, but when they come from -drive, they're all
   1933     * QString.
   1934     */
   1935    drvname = qdict_get_try_str(*options, "driver");
   1936    if (drvname) {
   1937        drv = bdrv_find_format(drvname);
   1938        if (!drv) {
   1939            error_setg(errp, "Unknown driver '%s'", drvname);
   1940            return -ENOENT;
   1941        }
   1942        /* If the user has explicitly specified the driver, this choice should
   1943         * override the BDRV_O_PROTOCOL flag */
   1944        protocol = drv->bdrv_file_open;
   1945    }
   1946
   1947    if (protocol) {
   1948        *flags |= BDRV_O_PROTOCOL;
   1949    } else {
   1950        *flags &= ~BDRV_O_PROTOCOL;
   1951    }
   1952
   1953    /* Translate cache options from flags into options */
   1954    update_options_from_flags(*options, *flags);
   1955
   1956    /* Fetch the file name from the options QDict if necessary */
   1957    if (protocol && filename) {
   1958        if (!qdict_haskey(*options, "filename")) {
   1959            qdict_put_str(*options, "filename", filename);
   1960            parse_filename = true;
   1961        } else {
   1962            error_setg(errp, "Can't specify 'file' and 'filename' options at "
   1963                             "the same time");
   1964            return -EINVAL;
   1965        }
   1966    }
   1967
   1968    /* Find the right block driver */
   1969    /* See cautionary note on accessing @options above */
   1970    filename = qdict_get_try_str(*options, "filename");
   1971
   1972    if (!drvname && protocol) {
   1973        if (filename) {
   1974            drv = bdrv_find_protocol(filename, parse_filename, errp);
   1975            if (!drv) {
   1976                return -EINVAL;
   1977            }
   1978
   1979            drvname = drv->format_name;
   1980            qdict_put_str(*options, "driver", drvname);
   1981        } else {
   1982            error_setg(errp, "Must specify either driver or file");
   1983            return -EINVAL;
   1984        }
   1985    }
   1986
   1987    assert(drv || !protocol);
   1988
   1989    /* Driver-specific filename parsing */
   1990    if (drv && drv->bdrv_parse_filename && parse_filename) {
   1991        drv->bdrv_parse_filename(filename, *options, &local_err);
   1992        if (local_err) {
   1993            error_propagate(errp, local_err);
   1994            return -EINVAL;
   1995        }
   1996
   1997        if (!drv->bdrv_needs_filename) {
   1998            qdict_del(*options, "filename");
   1999        }
   2000    }
   2001
   2002    return 0;
   2003}
   2004
   2005typedef struct BlockReopenQueueEntry {
   2006     bool prepared;
   2007     bool perms_checked;
   2008     BDRVReopenState state;
   2009     QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
   2010} BlockReopenQueueEntry;
   2011
   2012/*
   2013 * Return the flags that @bs will have after the reopens in @q have
   2014 * successfully completed. If @q is NULL (or @bs is not contained in @q),
   2015 * return the current flags.
   2016 */
   2017static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs)
   2018{
   2019    BlockReopenQueueEntry *entry;
   2020
   2021    if (q != NULL) {
   2022        QTAILQ_FOREACH(entry, q, entry) {
   2023            if (entry->state.bs == bs) {
   2024                return entry->state.flags;
   2025            }
   2026        }
   2027    }
   2028
   2029    return bs->open_flags;
   2030}
   2031
   2032/* Returns whether the image file can be written to after the reopen queue @q
   2033 * has been successfully applied, or right now if @q is NULL. */
   2034static bool bdrv_is_writable_after_reopen(BlockDriverState *bs,
   2035                                          BlockReopenQueue *q)
   2036{
   2037    int flags = bdrv_reopen_get_flags(q, bs);
   2038
   2039    return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR;
   2040}
   2041
   2042/*
   2043 * Return whether the BDS can be written to.  This is not necessarily
   2044 * the same as !bdrv_is_read_only(bs), as inactivated images may not
   2045 * be written to but do not count as read-only images.
   2046 */
   2047bool bdrv_is_writable(BlockDriverState *bs)
   2048{
   2049    return bdrv_is_writable_after_reopen(bs, NULL);
   2050}
   2051
   2052static char *bdrv_child_user_desc(BdrvChild *c)
   2053{
   2054    return c->klass->get_parent_desc(c);
   2055}
   2056
   2057/*
   2058 * Check that @a allows everything that @b needs. @a and @b must reference same
   2059 * child node.
   2060 */
   2061static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
   2062{
   2063    const char *child_bs_name;
   2064    g_autofree char *a_user = NULL;
   2065    g_autofree char *b_user = NULL;
   2066    g_autofree char *perms = NULL;
   2067
   2068    assert(a->bs);
   2069    assert(a->bs == b->bs);
   2070
   2071    if ((b->perm & a->shared_perm) == b->perm) {
   2072        return true;
   2073    }
   2074
   2075    child_bs_name = bdrv_get_node_name(b->bs);
   2076    a_user = bdrv_child_user_desc(a);
   2077    b_user = bdrv_child_user_desc(b);
   2078    perms = bdrv_perm_names(b->perm & ~a->shared_perm);
   2079
   2080    error_setg(errp, "Permission conflict on node '%s': permissions '%s' are "
   2081               "both required by %s (uses node '%s' as '%s' child) and "
   2082               "unshared by %s (uses node '%s' as '%s' child).",
   2083               child_bs_name, perms,
   2084               b_user, child_bs_name, b->name,
   2085               a_user, child_bs_name, a->name);
   2086
   2087    return false;
   2088}
   2089
   2090static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
   2091{
   2092    BdrvChild *a, *b;
   2093
   2094    /*
   2095     * During the loop we'll look at each pair twice. That's correct because
   2096     * bdrv_a_allow_b() is asymmetric and we should check each pair in both
   2097     * directions.
   2098     */
   2099    QLIST_FOREACH(a, &bs->parents, next_parent) {
   2100        QLIST_FOREACH(b, &bs->parents, next_parent) {
   2101            if (a == b) {
   2102                continue;
   2103            }
   2104
   2105            if (!bdrv_a_allow_b(a, b, errp)) {
   2106                return true;
   2107            }
   2108        }
   2109    }
   2110
   2111    return false;
   2112}
   2113
   2114static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
   2115                            BdrvChild *c, BdrvChildRole role,
   2116                            BlockReopenQueue *reopen_queue,
   2117                            uint64_t parent_perm, uint64_t parent_shared,
   2118                            uint64_t *nperm, uint64_t *nshared)
   2119{
   2120    assert(bs->drv && bs->drv->bdrv_child_perm);
   2121    bs->drv->bdrv_child_perm(bs, c, role, reopen_queue,
   2122                             parent_perm, parent_shared,
   2123                             nperm, nshared);
   2124    /* TODO Take force_share from reopen_queue */
   2125    if (child_bs && child_bs->force_share) {
   2126        *nshared = BLK_PERM_ALL;
   2127    }
   2128}
   2129
   2130/*
   2131 * Adds the whole subtree of @bs (including @bs itself) to the @list (except for
   2132 * nodes that are already in the @list, of course) so that final list is
   2133 * topologically sorted. Return the result (GSList @list object is updated, so
   2134 * don't use old reference after function call).
   2135 *
   2136 * On function start @list must be already topologically sorted and for any node
   2137 * in the @list the whole subtree of the node must be in the @list as well. The
   2138 * simplest way to satisfy this criteria: use only result of
   2139 * bdrv_topological_dfs() or NULL as @list parameter.
   2140 */
   2141static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,
   2142                                    BlockDriverState *bs)
   2143{
   2144    BdrvChild *child;
   2145    g_autoptr(GHashTable) local_found = NULL;
   2146
   2147    if (!found) {
   2148        assert(!list);
   2149        found = local_found = g_hash_table_new(NULL, NULL);
   2150    }
   2151
   2152    if (g_hash_table_contains(found, bs)) {
   2153        return list;
   2154    }
   2155    g_hash_table_add(found, bs);
   2156
   2157    QLIST_FOREACH(child, &bs->children, next) {
   2158        list = bdrv_topological_dfs(list, found, child->bs);
   2159    }
   2160
   2161    return g_slist_prepend(list, bs);
   2162}
   2163
   2164typedef struct BdrvChildSetPermState {
   2165    BdrvChild *child;
   2166    uint64_t old_perm;
   2167    uint64_t old_shared_perm;
   2168} BdrvChildSetPermState;
   2169
   2170static void bdrv_child_set_perm_abort(void *opaque)
   2171{
   2172    BdrvChildSetPermState *s = opaque;
   2173
   2174    s->child->perm = s->old_perm;
   2175    s->child->shared_perm = s->old_shared_perm;
   2176}
   2177
   2178static TransactionActionDrv bdrv_child_set_pem_drv = {
   2179    .abort = bdrv_child_set_perm_abort,
   2180    .clean = g_free,
   2181};
   2182
   2183static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm,
   2184                                uint64_t shared, Transaction *tran)
   2185{
   2186    BdrvChildSetPermState *s = g_new(BdrvChildSetPermState, 1);
   2187
   2188    *s = (BdrvChildSetPermState) {
   2189        .child = c,
   2190        .old_perm = c->perm,
   2191        .old_shared_perm = c->shared_perm,
   2192    };
   2193
   2194    c->perm = perm;
   2195    c->shared_perm = shared;
   2196
   2197    tran_add(tran, &bdrv_child_set_pem_drv, s);
   2198}
   2199
   2200static void bdrv_drv_set_perm_commit(void *opaque)
   2201{
   2202    BlockDriverState *bs = opaque;
   2203    uint64_t cumulative_perms, cumulative_shared_perms;
   2204
   2205    if (bs->drv->bdrv_set_perm) {
   2206        bdrv_get_cumulative_perm(bs, &cumulative_perms,
   2207                                 &cumulative_shared_perms);
   2208        bs->drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
   2209    }
   2210}
   2211
   2212static void bdrv_drv_set_perm_abort(void *opaque)
   2213{
   2214    BlockDriverState *bs = opaque;
   2215
   2216    if (bs->drv->bdrv_abort_perm_update) {
   2217        bs->drv->bdrv_abort_perm_update(bs);
   2218    }
   2219}
   2220
   2221TransactionActionDrv bdrv_drv_set_perm_drv = {
   2222    .abort = bdrv_drv_set_perm_abort,
   2223    .commit = bdrv_drv_set_perm_commit,
   2224};
   2225
   2226static int bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm,
   2227                             uint64_t shared_perm, Transaction *tran,
   2228                             Error **errp)
   2229{
   2230    if (!bs->drv) {
   2231        return 0;
   2232    }
   2233
   2234    if (bs->drv->bdrv_check_perm) {
   2235        int ret = bs->drv->bdrv_check_perm(bs, perm, shared_perm, errp);
   2236        if (ret < 0) {
   2237            return ret;
   2238        }
   2239    }
   2240
   2241    if (tran) {
   2242        tran_add(tran, &bdrv_drv_set_perm_drv, bs);
   2243    }
   2244
   2245    return 0;
   2246}
   2247
   2248typedef struct BdrvReplaceChildState {
   2249    BdrvChild *child;
   2250    BlockDriverState *old_bs;
   2251} BdrvReplaceChildState;
   2252
   2253static void bdrv_replace_child_commit(void *opaque)
   2254{
   2255    BdrvReplaceChildState *s = opaque;
   2256
   2257    bdrv_unref(s->old_bs);
   2258}
   2259
   2260static void bdrv_replace_child_abort(void *opaque)
   2261{
   2262    BdrvReplaceChildState *s = opaque;
   2263    BlockDriverState *new_bs = s->child->bs;
   2264
   2265    /* old_bs reference is transparently moved from @s to @s->child */
   2266    bdrv_replace_child_noperm(s->child, s->old_bs);
   2267    bdrv_unref(new_bs);
   2268}
   2269
   2270static TransactionActionDrv bdrv_replace_child_drv = {
   2271    .commit = bdrv_replace_child_commit,
   2272    .abort = bdrv_replace_child_abort,
   2273    .clean = g_free,
   2274};
   2275
   2276/*
   2277 * bdrv_replace_child_tran
   2278 *
   2279 * Note: real unref of old_bs is done only on commit.
   2280 *
   2281 * The function doesn't update permissions, caller is responsible for this.
   2282 */
   2283static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
   2284                                    Transaction *tran)
   2285{
   2286    BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
   2287    *s = (BdrvReplaceChildState) {
   2288        .child = child,
   2289        .old_bs = child->bs,
   2290    };
   2291    tran_add(tran, &bdrv_replace_child_drv, s);
   2292
   2293    if (new_bs) {
   2294        bdrv_ref(new_bs);
   2295    }
   2296    bdrv_replace_child_noperm(child, new_bs);
   2297    /* old_bs reference is transparently moved from @child to @s */
   2298}
   2299
   2300/*
   2301 * Refresh permissions in @bs subtree. The function is intended to be called
   2302 * after some graph modification that was done without permission update.
   2303 */
   2304static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
   2305                                  Transaction *tran, Error **errp)
   2306{
   2307    BlockDriver *drv = bs->drv;
   2308    BdrvChild *c;
   2309    int ret;
   2310    uint64_t cumulative_perms, cumulative_shared_perms;
   2311
   2312    bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
   2313
   2314    /* Write permissions never work with read-only images */
   2315    if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
   2316        !bdrv_is_writable_after_reopen(bs, q))
   2317    {
   2318        if (!bdrv_is_writable_after_reopen(bs, NULL)) {
   2319            error_setg(errp, "Block node is read-only");
   2320        } else {
   2321            error_setg(errp, "Read-only block node '%s' cannot support "
   2322                       "read-write users", bdrv_get_node_name(bs));
   2323        }
   2324
   2325        return -EPERM;
   2326    }
   2327
   2328    /*
   2329     * Unaligned requests will automatically be aligned to bl.request_alignment
   2330     * and without RESIZE we can't extend requests to write to space beyond the
   2331     * end of the image, so it's required that the image size is aligned.
   2332     */
   2333    if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
   2334        !(cumulative_perms & BLK_PERM_RESIZE))
   2335    {
   2336        if ((bs->total_sectors * BDRV_SECTOR_SIZE) % bs->bl.request_alignment) {
   2337            error_setg(errp, "Cannot get 'write' permission without 'resize': "
   2338                             "Image size is not a multiple of request "
   2339                             "alignment");
   2340            return -EPERM;
   2341        }
   2342    }
   2343
   2344    /* Check this node */
   2345    if (!drv) {
   2346        return 0;
   2347    }
   2348
   2349    ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, tran,
   2350                            errp);
   2351    if (ret < 0) {
   2352        return ret;
   2353    }
   2354
   2355    /* Drivers that never have children can omit .bdrv_child_perm() */
   2356    if (!drv->bdrv_child_perm) {
   2357        assert(QLIST_EMPTY(&bs->children));
   2358        return 0;
   2359    }
   2360
   2361    /* Check all children */
   2362    QLIST_FOREACH(c, &bs->children, next) {
   2363        uint64_t cur_perm, cur_shared;
   2364
   2365        bdrv_child_perm(bs, c->bs, c, c->role, q,
   2366                        cumulative_perms, cumulative_shared_perms,
   2367                        &cur_perm, &cur_shared);
   2368        bdrv_child_set_perm(c, cur_perm, cur_shared, tran);
   2369    }
   2370
   2371    return 0;
   2372}
   2373
   2374static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
   2375                                   Transaction *tran, Error **errp)
   2376{
   2377    int ret;
   2378    BlockDriverState *bs;
   2379
   2380    for ( ; list; list = list->next) {
   2381        bs = list->data;
   2382
   2383        if (bdrv_parent_perms_conflict(bs, errp)) {
   2384            return -EINVAL;
   2385        }
   2386
   2387        ret = bdrv_node_refresh_perm(bs, q, tran, errp);
   2388        if (ret < 0) {
   2389            return ret;
   2390        }
   2391    }
   2392
   2393    return 0;
   2394}
   2395
   2396void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
   2397                              uint64_t *shared_perm)
   2398{
   2399    BdrvChild *c;
   2400    uint64_t cumulative_perms = 0;
   2401    uint64_t cumulative_shared_perms = BLK_PERM_ALL;
   2402
   2403    QLIST_FOREACH(c, &bs->parents, next_parent) {
   2404        cumulative_perms |= c->perm;
   2405        cumulative_shared_perms &= c->shared_perm;
   2406    }
   2407
   2408    *perm = cumulative_perms;
   2409    *shared_perm = cumulative_shared_perms;
   2410}
   2411
   2412char *bdrv_perm_names(uint64_t perm)
   2413{
   2414    struct perm_name {
   2415        uint64_t perm;
   2416        const char *name;
   2417    } permissions[] = {
   2418        { BLK_PERM_CONSISTENT_READ, "consistent read" },
   2419        { BLK_PERM_WRITE,           "write" },
   2420        { BLK_PERM_WRITE_UNCHANGED, "write unchanged" },
   2421        { BLK_PERM_RESIZE,          "resize" },
   2422        { BLK_PERM_GRAPH_MOD,       "change children" },
   2423        { 0, NULL }
   2424    };
   2425
   2426    GString *result = g_string_sized_new(30);
   2427    struct perm_name *p;
   2428
   2429    for (p = permissions; p->name; p++) {
   2430        if (perm & p->perm) {
   2431            if (result->len > 0) {
   2432                g_string_append(result, ", ");
   2433            }
   2434            g_string_append(result, p->name);
   2435        }
   2436    }
   2437
   2438    return g_string_free(result, FALSE);
   2439}
   2440
   2441
   2442static int bdrv_refresh_perms(BlockDriverState *bs, Error **errp)
   2443{
   2444    int ret;
   2445    Transaction *tran = tran_new();
   2446    g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
   2447
   2448    ret = bdrv_list_refresh_perms(list, NULL, tran, errp);
   2449    tran_finalize(tran, ret);
   2450
   2451    return ret;
   2452}
   2453
   2454int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
   2455                            Error **errp)
   2456{
   2457    Error *local_err = NULL;
   2458    Transaction *tran = tran_new();
   2459    int ret;
   2460
   2461    bdrv_child_set_perm(c, perm, shared, tran);
   2462
   2463    ret = bdrv_refresh_perms(c->bs, &local_err);
   2464
   2465    tran_finalize(tran, ret);
   2466
   2467    if (ret < 0) {
   2468        if ((perm & ~c->perm) || (c->shared_perm & ~shared)) {
   2469            /* tighten permissions */
   2470            error_propagate(errp, local_err);
   2471        } else {
   2472            /*
   2473             * Our caller may intend to only loosen restrictions and
   2474             * does not expect this function to fail.  Errors are not
   2475             * fatal in such a case, so we can just hide them from our
   2476             * caller.
   2477             */
   2478            error_free(local_err);
   2479            ret = 0;
   2480        }
   2481    }
   2482
   2483    return ret;
   2484}
   2485
   2486int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp)
   2487{
   2488    uint64_t parent_perms, parent_shared;
   2489    uint64_t perms, shared;
   2490
   2491    bdrv_get_cumulative_perm(bs, &parent_perms, &parent_shared);
   2492    bdrv_child_perm(bs, c->bs, c, c->role, NULL,
   2493                    parent_perms, parent_shared, &perms, &shared);
   2494
   2495    return bdrv_child_try_set_perm(c, perms, shared, errp);
   2496}
   2497
   2498/*
   2499 * Default implementation for .bdrv_child_perm() for block filters:
   2500 * Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED, and RESIZE to the
   2501 * filtered child.
   2502 */
   2503static void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
   2504                                      BdrvChildRole role,
   2505                                      BlockReopenQueue *reopen_queue,
   2506                                      uint64_t perm, uint64_t shared,
   2507                                      uint64_t *nperm, uint64_t *nshared)
   2508{
   2509    *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
   2510    *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
   2511}
   2512
   2513static void bdrv_default_perms_for_cow(BlockDriverState *bs, BdrvChild *c,
   2514                                       BdrvChildRole role,
   2515                                       BlockReopenQueue *reopen_queue,
   2516                                       uint64_t perm, uint64_t shared,
   2517                                       uint64_t *nperm, uint64_t *nshared)
   2518{
   2519    assert(role & BDRV_CHILD_COW);
   2520
   2521    /*
   2522     * We want consistent read from backing files if the parent needs it.
   2523     * No other operations are performed on backing files.
   2524     */
   2525    perm &= BLK_PERM_CONSISTENT_READ;
   2526
   2527    /*
   2528     * If the parent can deal with changing data, we're okay with a
   2529     * writable and resizable backing file.
   2530     * TODO Require !(perm & BLK_PERM_CONSISTENT_READ), too?
   2531     */
   2532    if (shared & BLK_PERM_WRITE) {
   2533        shared = BLK_PERM_WRITE | BLK_PERM_RESIZE;
   2534    } else {
   2535        shared = 0;
   2536    }
   2537
   2538    shared |= BLK_PERM_CONSISTENT_READ | BLK_PERM_GRAPH_MOD |
   2539              BLK_PERM_WRITE_UNCHANGED;
   2540
   2541    if (bs->open_flags & BDRV_O_INACTIVE) {
   2542        shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
   2543    }
   2544
   2545    *nperm = perm;
   2546    *nshared = shared;
   2547}
   2548
   2549static void bdrv_default_perms_for_storage(BlockDriverState *bs, BdrvChild *c,
   2550                                           BdrvChildRole role,
   2551                                           BlockReopenQueue *reopen_queue,
   2552                                           uint64_t perm, uint64_t shared,
   2553                                           uint64_t *nperm, uint64_t *nshared)
   2554{
   2555    int flags;
   2556
   2557    assert(role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA));
   2558
   2559    flags = bdrv_reopen_get_flags(reopen_queue, bs);
   2560
   2561    /*
   2562     * Apart from the modifications below, the same permissions are
   2563     * forwarded and left alone as for filters
   2564     */
   2565    bdrv_filter_default_perms(bs, c, role, reopen_queue,
   2566                              perm, shared, &perm, &shared);
   2567
   2568    if (role & BDRV_CHILD_METADATA) {
   2569        /* Format drivers may touch metadata even if the guest doesn't write */
   2570        if (bdrv_is_writable_after_reopen(bs, reopen_queue)) {
   2571            perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
   2572        }
   2573
   2574        /*
   2575         * bs->file always needs to be consistent because of the
   2576         * metadata. We can never allow other users to resize or write
   2577         * to it.
   2578         */
   2579        if (!(flags & BDRV_O_NO_IO)) {
   2580            perm |= BLK_PERM_CONSISTENT_READ;
   2581        }
   2582        shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
   2583    }
   2584
   2585    if (role & BDRV_CHILD_DATA) {
   2586        /*
   2587         * Technically, everything in this block is a subset of the
   2588         * BDRV_CHILD_METADATA path taken above, and so this could
   2589         * be an "else if" branch.  However, that is not obvious, and
   2590         * this function is not performance critical, therefore we let
   2591         * this be an independent "if".
   2592         */
   2593
   2594        /*
   2595         * We cannot allow other users to resize the file because the
   2596         * format driver might have some assumptions about the size
   2597         * (e.g. because it is stored in metadata, or because the file
   2598         * is split into fixed-size data files).
   2599         */
   2600        shared &= ~BLK_PERM_RESIZE;
   2601
   2602        /*
   2603         * WRITE_UNCHANGED often cannot be performed as such on the
   2604         * data file.  For example, the qcow2 driver may still need to
   2605         * write copied clusters on copy-on-read.
   2606         */
   2607        if (perm & BLK_PERM_WRITE_UNCHANGED) {
   2608            perm |= BLK_PERM_WRITE;
   2609        }
   2610
   2611        /*
   2612         * If the data file is written to, the format driver may
   2613         * expect to be able to resize it by writing beyond the EOF.
   2614         */
   2615        if (perm & BLK_PERM_WRITE) {
   2616            perm |= BLK_PERM_RESIZE;
   2617        }
   2618    }
   2619
   2620    if (bs->open_flags & BDRV_O_INACTIVE) {
   2621        shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
   2622    }
   2623
   2624    *nperm = perm;
   2625    *nshared = shared;
   2626}
   2627
   2628void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
   2629                        BdrvChildRole role, BlockReopenQueue *reopen_queue,
   2630                        uint64_t perm, uint64_t shared,
   2631                        uint64_t *nperm, uint64_t *nshared)
   2632{
   2633    if (role & BDRV_CHILD_FILTERED) {
   2634        assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
   2635                         BDRV_CHILD_COW)));
   2636        bdrv_filter_default_perms(bs, c, role, reopen_queue,
   2637                                  perm, shared, nperm, nshared);
   2638    } else if (role & BDRV_CHILD_COW) {
   2639        assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA)));
   2640        bdrv_default_perms_for_cow(bs, c, role, reopen_queue,
   2641                                   perm, shared, nperm, nshared);
   2642    } else if (role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA)) {
   2643        bdrv_default_perms_for_storage(bs, c, role, reopen_queue,
   2644                                       perm, shared, nperm, nshared);
   2645    } else {
   2646        g_assert_not_reached();
   2647    }
   2648}
   2649
   2650uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
   2651{
   2652    static const uint64_t permissions[] = {
   2653        [BLOCK_PERMISSION_CONSISTENT_READ]  = BLK_PERM_CONSISTENT_READ,
   2654        [BLOCK_PERMISSION_WRITE]            = BLK_PERM_WRITE,
   2655        [BLOCK_PERMISSION_WRITE_UNCHANGED]  = BLK_PERM_WRITE_UNCHANGED,
   2656        [BLOCK_PERMISSION_RESIZE]           = BLK_PERM_RESIZE,
   2657        [BLOCK_PERMISSION_GRAPH_MOD]        = BLK_PERM_GRAPH_MOD,
   2658    };
   2659
   2660    QEMU_BUILD_BUG_ON(ARRAY_SIZE(permissions) != BLOCK_PERMISSION__MAX);
   2661    QEMU_BUILD_BUG_ON(1UL << ARRAY_SIZE(permissions) != BLK_PERM_ALL + 1);
   2662
   2663    assert(qapi_perm < BLOCK_PERMISSION__MAX);
   2664
   2665    return permissions[qapi_perm];
   2666}
   2667
/*
 * Make @child point to @new_bs instead of its current node, keeping the
 * drained state seen by the parent consistent.  Either the old node or
 * @new_bs may be NULL (pure attach / pure detach).
 *
 * As the name says, permissions are not touched here; that is up to the
 * caller.  @child must not be frozen, and if both nodes are non-NULL they
 * must be in the same AioContext (both asserted).
 */
static void bdrv_replace_child_noperm(BdrvChild *child,
                                      BlockDriverState *new_bs)
{
    BlockDriverState *old_bs = child->bs;
    int new_bs_quiesce_counter;
    /*
     * Number of drained sections the parent still has to enter (positive)
     * or leave (negative) to match the quiesce counter of the new node.
     */
    int drain_saldo;

    assert(!child->frozen);

    if (old_bs && new_bs) {
        assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
    }

    /* Sample the counter now; detaching below may still lower it */
    new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
    drain_saldo = new_bs_quiesce_counter - child->parent_quiesce_counter;

    /*
     * If the new child node is drained but the old one was not, flush
     * all outstanding requests to the old child node.
     */
    while (drain_saldo > 0 && child->klass->drained_begin) {
        bdrv_parent_drained_begin_single(child, true);
        drain_saldo--;
    }

    if (old_bs) {
        /* Detach first so that the recursive drain sections coming from @child
         * are already gone and we only end the drain sections that came from
         * elsewhere. */
        if (child->klass->detach) {
            child->klass->detach(child);
        }
        QLIST_REMOVE(child, next_parent);
    }

    child->bs = new_bs;

    if (new_bs) {
        QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);

        /*
         * Detaching the old node may have led to the new node's
         * quiesce_counter having been decreased.  Not a problem, we
         * just need to recognize this here and then invoke
         * drained_end appropriately more often.
         */
        assert(new_bs->quiesce_counter <= new_bs_quiesce_counter);
        drain_saldo += new_bs->quiesce_counter - new_bs_quiesce_counter;

        /* Attach only after starting new drained sections, so that recursive
         * drain sections coming from @child don't get an extra .drained_begin
         * callback. */
        if (child->klass->attach) {
            child->klass->attach(child);
        }
    }

    /*
     * If the old child node was drained but the new one is not, allow
     * requests to come in only after the new node has been attached.
     */
    while (drain_saldo < 0 && child->klass->drained_end) {
        bdrv_parent_drained_end_single(child);
        drain_saldo++;
    }
}
   2734
   2735static void bdrv_child_free(void *opaque)
   2736{
   2737    BdrvChild *c = opaque;
   2738
   2739    g_free(c->name);
   2740    g_free(c);
   2741}
   2742
   2743static void bdrv_remove_empty_child(BdrvChild *child)
   2744{
   2745    assert(!child->bs);
   2746    QLIST_SAFE_REMOVE(child, next);
   2747    bdrv_child_free(child);
   2748}
   2749
/*
 * Transaction state recorded by bdrv_attach_child_common() so that
 * bdrv_attach_child_common_abort() can undo the attach.
 */
typedef struct BdrvAttachChildCommonState {
    /* Caller's variable that received the new child; reset to NULL on abort */
    BdrvChild **child;
    /* AioContext of the parent before the attach */
    AioContext *old_parent_ctx;
    /* AioContext of the child node before the attach */
    AioContext *old_child_ctx;
} BdrvAttachChildCommonState;
   2755
/*
 * Transaction .abort handler for bdrv_attach_child_common(): detach the
 * child again, move node and parent back to the AioContexts they had before
 * the attach, drop the reference taken on the node, free the child and
 * reset the caller's child pointer to NULL.
 *
 * @opaque is the BdrvAttachChildCommonState saved at attach time.
 */
static void bdrv_attach_child_common_abort(void *opaque)
{
    BdrvAttachChildCommonState *s = opaque;
    BdrvChild *child = *s->child;
    /* Remember the node; child->bs becomes NULL in the next step */
    BlockDriverState *bs = child->bs;

    bdrv_replace_child_noperm(child, NULL);

    /* Undo a move of the child node; failure here is fatal (&error_abort) */
    if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
        bdrv_try_set_aio_context(bs, s->old_child_ctx, &error_abort);
    }

    /* Undo a move of the parent, using the same check-then-set sequence */
    if (bdrv_child_get_parent_aio_context(child) != s->old_parent_ctx) {
        GSList *ignore = g_slist_prepend(NULL, child);

        child->klass->can_set_aio_ctx(child, s->old_parent_ctx, &ignore,
                                      &error_abort);
        g_slist_free(ignore);
        /* Start the second pass with a fresh ignore list */
        ignore = g_slist_prepend(NULL, child);
        child->klass->set_aio_ctx(child, s->old_parent_ctx, &ignore);

        g_slist_free(ignore);
    }

    /* Drop the reference taken by bdrv_attach_child_common() */
    bdrv_unref(bs);
    bdrv_remove_empty_child(child);
    *s->child = NULL;
}
   2784
/*
 * Transaction action for bdrv_attach_child_common(): only rollback needs
 * work; the saved state is released with g_free() in .clean.
 */
static TransactionActionDrv bdrv_attach_child_common_drv = {
    .abort = bdrv_attach_child_common_abort,
    .clean = g_free,
};
   2789
/*
 * Common part of attaching a bdrv child to a bs, to a blk or to a job.
 *
 * The resulting new child is returned through @child.
 * On entry *@child must be NULL.
 * @child itself is saved in a new entry of @tran, so that *@child can be
 * reset to NULL on abort().  The referenced variable must therefore live at
 * least until the end of the transaction.
 *
 * On success a new reference to @child_bs is taken and 0 is returned.
 * On failure (the AioContexts of parent and child could not be reconciled)
 * a negative value is returned, @errp is set, the half-built child is freed
 * and nothing is added to @tran.
 *
 * This function doesn't update permissions; the caller is responsible for
 * that.
 */
static int bdrv_attach_child_common(BlockDriverState *child_bs,
                                    const char *child_name,
                                    const BdrvChildClass *child_class,
                                    BdrvChildRole child_role,
                                    uint64_t perm, uint64_t shared_perm,
                                    void *opaque, BdrvChild **child,
                                    Transaction *tran, Error **errp)
{
    BdrvChild *new_child;
    AioContext *parent_ctx;
    AioContext *child_ctx = bdrv_get_aio_context(child_bs);

    assert(child);
    assert(*child == NULL);
    assert(child_class->get_parent_desc);

    /* Members not listed in the initializer are zeroed by the assignment */
    new_child = g_new(BdrvChild, 1);
    *new_child = (BdrvChild) {
        .bs             = NULL,
        .name           = g_strdup(child_name),
        .klass          = child_class,
        .role           = child_role,
        .perm           = perm,
        .shared_perm    = shared_perm,
        .opaque         = opaque,
    };

    /*
     * If the AioContexts don't match, first try to move the subtree of
     * child_bs into the AioContext of the new parent. If this doesn't work,
     * try moving the parent into the AioContext of child_bs instead.
     */
    parent_ctx = bdrv_child_get_parent_aio_context(new_child);
    if (child_ctx != parent_ctx) {
        Error *local_err = NULL;
        int ret = bdrv_try_set_aio_context(child_bs, parent_ctx, &local_err);

        if (ret < 0 && child_class->can_set_aio_ctx) {
            GSList *ignore = g_slist_prepend(NULL, new_child);
            if (child_class->can_set_aio_ctx(new_child, child_ctx, &ignore,
                                             NULL))
            {
                /* The fallback works, so the first error is not reported */
                error_free(local_err);
                ret = 0;
                g_slist_free(ignore);
                ignore = g_slist_prepend(NULL, new_child);
                child_class->set_aio_ctx(new_child, child_ctx, &ignore);
            }
            g_slist_free(ignore);
        }

        if (ret < 0) {
            error_propagate(errp, local_err);
            bdrv_remove_empty_child(new_child);
            return ret;
        }
    }

    /* The new child holds its own reference to the node */
    bdrv_ref(child_bs);
    bdrv_replace_child_noperm(new_child, child_bs);

    *child = new_child;

    /* Record what bdrv_attach_child_common_abort() needs for a rollback */
    BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
    *s = (BdrvAttachChildCommonState) {
        .child = child,
        .old_parent_ctx = parent_ctx,
        .old_child_ctx = child_ctx,
    };
    tran_add(tran, &bdrv_attach_child_common_drv, s);

    return 0;
}
   2874
   2875/*
   2876 * Variable referenced by @child must live at least until transaction end.
   2877 * (see bdrv_attach_child_common() doc for details)
   2878 *
   2879 * Function doesn't update permissions, caller is responsible for this.
   2880 */
   2881static int bdrv_attach_child_noperm(BlockDriverState *parent_bs,
   2882                                    BlockDriverState *child_bs,
   2883                                    const char *child_name,
   2884                                    const BdrvChildClass *child_class,
   2885                                    BdrvChildRole child_role,
   2886                                    BdrvChild **child,
   2887                                    Transaction *tran,
   2888                                    Error **errp)
   2889{
   2890    int ret;
   2891    uint64_t perm, shared_perm;
   2892
   2893    assert(parent_bs->drv);
   2894
   2895    bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm);
   2896    bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL,
   2897                    perm, shared_perm, &perm, &shared_perm);
   2898
   2899    ret = bdrv_attach_child_common(child_bs, child_name, child_class,
   2900                                   child_role, perm, shared_perm, parent_bs,
   2901                                   child, tran, errp);
   2902    if (ret < 0) {
   2903        return ret;
   2904    }
   2905
   2906    QLIST_INSERT_HEAD(&parent_bs->children, *child, next);
   2907    /*
   2908     * child is removed in bdrv_attach_child_common_abort(), so don't care to
   2909     * abort this change separately.
   2910     */
   2911
   2912    return 0;
   2913}
   2914
   2915static void bdrv_detach_child(BdrvChild *child)
   2916{
   2917    BlockDriverState *old_bs = child->bs;
   2918
   2919    bdrv_replace_child_noperm(child, NULL);
   2920    bdrv_remove_empty_child(child);
   2921
   2922    if (old_bs) {
   2923        /*
   2924         * Update permissions for old node. We're just taking a parent away, so
   2925         * we're loosening restrictions. Errors of permission update are not
   2926         * fatal in this case, ignore them.
   2927         */
   2928        bdrv_refresh_perms(old_bs, NULL);
   2929
   2930        /*
   2931         * When the parent requiring a non-default AioContext is removed, the
   2932         * node moves back to the main AioContext
   2933         */
   2934        bdrv_try_set_aio_context(old_bs, qemu_get_aio_context(), NULL);
   2935    }
   2936}
   2937
   2938/*
   2939 * This function steals the reference to child_bs from the caller.
   2940 * That reference is later dropped by bdrv_root_unref_child().
   2941 *
   2942 * On failure NULL is returned, errp is set and the reference to
   2943 * child_bs is also dropped.
   2944 *
   2945 * The caller must hold the AioContext lock @child_bs, but not that of @ctx
   2946 * (unless @child_bs is already in @ctx).
   2947 */
   2948BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
   2949                                  const char *child_name,
   2950                                  const BdrvChildClass *child_class,
   2951                                  BdrvChildRole child_role,
   2952                                  uint64_t perm, uint64_t shared_perm,
   2953                                  void *opaque, Error **errp)
   2954{
   2955    int ret;
   2956    BdrvChild *child = NULL;
   2957    Transaction *tran = tran_new();
   2958
   2959    ret = bdrv_attach_child_common(child_bs, child_name, child_class,
   2960                                   child_role, perm, shared_perm, opaque,
   2961                                   &child, tran, errp);
   2962    if (ret < 0) {
   2963        goto out;
   2964    }
   2965
   2966    ret = bdrv_refresh_perms(child_bs, errp);
   2967
   2968out:
   2969    tran_finalize(tran, ret);
   2970    /* child is unset on failure by bdrv_attach_child_common_abort() */
   2971    assert((ret < 0) == !child);
   2972
   2973    bdrv_unref(child_bs);
   2974    return child;
   2975}
   2976
   2977/*
   2978 * This function transfers the reference to child_bs from the caller
   2979 * to parent_bs. That reference is later dropped by parent_bs on
   2980 * bdrv_close() or if someone calls bdrv_unref_child().
   2981 *
   2982 * On failure NULL is returned, errp is set and the reference to
   2983 * child_bs is also dropped.
   2984 *
   2985 * If @parent_bs and @child_bs are in different AioContexts, the caller must
   2986 * hold the AioContext lock for @child_bs, but not for @parent_bs.
   2987 */
   2988BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
   2989                             BlockDriverState *child_bs,
   2990                             const char *child_name,
   2991                             const BdrvChildClass *child_class,
   2992                             BdrvChildRole child_role,
   2993                             Error **errp)
   2994{
   2995    int ret;
   2996    BdrvChild *child = NULL;
   2997    Transaction *tran = tran_new();
   2998
   2999    ret = bdrv_attach_child_noperm(parent_bs, child_bs, child_name, child_class,
   3000                                   child_role, &child, tran, errp);
   3001    if (ret < 0) {
   3002        goto out;
   3003    }
   3004
   3005    ret = bdrv_refresh_perms(parent_bs, errp);
   3006    if (ret < 0) {
   3007        goto out;
   3008    }
   3009
   3010out:
   3011    tran_finalize(tran, ret);
   3012    /* child is unset on failure by bdrv_attach_child_common_abort() */
   3013    assert((ret < 0) == !child);
   3014
   3015    bdrv_unref(child_bs);
   3016
   3017    return child;
   3018}
   3019
   3020/* Callers must ensure that child->frozen is false. */
   3021void bdrv_root_unref_child(BdrvChild *child)
   3022{
   3023    BlockDriverState *child_bs;
   3024
   3025    child_bs = child->bs;
   3026    bdrv_detach_child(child);
   3027    bdrv_unref(child_bs);
   3028}
   3029
/* Transaction state recorded by bdrv_set_inherits_from() for rollback */
typedef struct BdrvSetInheritsFrom {
    /* Node whose inherits_from pointer was changed */
    BlockDriverState *bs;
    /* Value of bs->inherits_from before the change */
    BlockDriverState *old_inherits_from;
} BdrvSetInheritsFrom;
   3034
   3035static void bdrv_set_inherits_from_abort(void *opaque)
   3036{
   3037    BdrvSetInheritsFrom *s = opaque;
   3038
   3039    s->bs->inherits_from = s->old_inherits_from;
   3040}
   3041
/*
 * Transaction action for bdrv_set_inherits_from(): restore the old pointer
 * on abort; the saved state is released with g_free() in .clean.
 */
static TransactionActionDrv bdrv_set_inherits_from_drv = {
    .abort = bdrv_set_inherits_from_abort,
    .clean = g_free,
};
   3046
   3047/* @tran is allowed to be NULL. In this case no rollback is possible */
   3048static void bdrv_set_inherits_from(BlockDriverState *bs,
   3049                                   BlockDriverState *new_inherits_from,
   3050                                   Transaction *tran)
   3051{
   3052    if (tran) {
   3053        BdrvSetInheritsFrom *s = g_new(BdrvSetInheritsFrom, 1);
   3054
   3055        *s = (BdrvSetInheritsFrom) {
   3056            .bs = bs,
   3057            .old_inherits_from = bs->inherits_from,
   3058        };
   3059
   3060        tran_add(tran, &bdrv_set_inherits_from_drv, s);
   3061    }
   3062
   3063    bs->inherits_from = new_inherits_from;
   3064}
   3065
   3066/**
   3067 * Clear all inherits_from pointers from children and grandchildren of
   3068 * @root that point to @root, where necessary.
   3069 * @tran is allowed to be NULL. In this case no rollback is possible
   3070 */
   3071static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
   3072                                     Transaction *tran)
   3073{
   3074    BdrvChild *c;
   3075
   3076    if (child->bs->inherits_from == root) {
   3077        /*
   3078         * Remove inherits_from only when the last reference between root and
   3079         * child->bs goes away.
   3080         */
   3081        QLIST_FOREACH(c, &root->children, next) {
   3082            if (c != child && c->bs == child->bs) {
   3083                break;
   3084            }
   3085        }
   3086        if (c == NULL) {
   3087            bdrv_set_inherits_from(child->bs, NULL, tran);
   3088        }
   3089    }
   3090
   3091    QLIST_FOREACH(c, &child->bs->children, next) {
   3092        bdrv_unset_inherits_from(root, c, tran);
   3093    }
   3094}
   3095
   3096/* Callers must ensure that child->frozen is false. */
   3097void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
   3098{
   3099    if (child == NULL) {
   3100        return;
   3101    }
   3102
   3103    bdrv_unset_inherits_from(parent, child, NULL);
   3104    bdrv_root_unref_child(child);
   3105}
   3106
   3107
   3108static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
   3109{
   3110    BdrvChild *c;
   3111    QLIST_FOREACH(c, &bs->parents, next_parent) {
   3112        if (c->klass->change_media) {
   3113            c->klass->change_media(c, load);
   3114        }
   3115    }
   3116}
   3117
   3118/* Return true if you can reach parent going through child->inherits_from
   3119 * recursively. If parent or child are NULL, return false */
   3120static bool bdrv_inherits_from_recursive(BlockDriverState *child,
   3121                                         BlockDriverState *parent)
   3122{
   3123    while (child && child != parent) {
   3124        child = child->inherits_from;
   3125    }
   3126
   3127    return child != NULL;
   3128}
   3129
   3130/*
   3131 * Return the BdrvChildRole for @bs's backing child.  bs->backing is
   3132 * mostly used for COW backing children (role = COW), but also for
   3133 * filtered children (role = FILTERED | PRIMARY).
   3134 */
   3135static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
   3136{
   3137    if (bs->drv && bs->drv->is_filter) {
   3138        return BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
   3139    } else {
   3140        return BDRV_CHILD_COW;
   3141    }
   3142}
   3143
/*
 * Sets the bs->backing or bs->file link of a BDS, selected by @is_backing.
 * A new reference is created; callers which don't need their own reference
 * any more must call bdrv_unref().
 *
 * An existing child in that slot is removed first.  A NULL @child_bs only
 * removes the existing child.  All changes are recorded in @tran.
 *
 * Returns 0 on success.  On failure, a negative errno value is returned and
 * @errp is set: -EINVAL if @parent_bs has no driver, if its driver supports
 * no backing files, or if a file child is requested on a non-filter node
 * that has none yet; -EPERM if the existing link is frozen; or the error
 * from bdrv_attach_child_noperm().
 *
 * Function doesn't update permissions, caller is responsible for this.
 */
static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
                                           BlockDriverState *child_bs,
                                           bool is_backing,
                                           Transaction *tran, Error **errp)
{
    int ret = 0;
    /*
     * Must be evaluated before the old child is removed: removing it may
     * clear inherits_from pointers (see bdrv_unset_inherits_from() below).
     */
    bool update_inherits_from =
        bdrv_inherits_from_recursive(child_bs, parent_bs);
    BdrvChild *child = is_backing ? parent_bs->backing : parent_bs->file;
    BdrvChildRole role;

    if (!parent_bs->drv) {
        /*
         * Node without drv is an object without a class :/. TODO: finally fix
         * qcow2 driver to never clear bs->drv and implement format corruption
         * handling in other way.
         */
        error_setg(errp, "Node corrupted");
        return -EINVAL;
    }

    if (child && child->frozen) {
        error_setg(errp, "Cannot change frozen '%s' link from '%s' to '%s'",
                   child->name, parent_bs->node_name, child->bs->node_name);
        return -EPERM;
    }

    if (is_backing && !parent_bs->drv->is_filter &&
        !parent_bs->drv->supports_backing)
    {
        error_setg(errp, "Driver '%s' of node '%s' does not support backing "
                   "files", parent_bs->drv->format_name, parent_bs->node_name);
        return -EINVAL;
    }

    /* Pick the role of the new child */
    if (parent_bs->drv->is_filter) {
        role = BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
    } else if (is_backing) {
        role = BDRV_CHILD_COW;
    } else {
        /*
         * We only can use same role as it is in existing child. We don't have
         * infrastructure to determine role of file child in generic way
         */
        if (!child) {
            error_setg(errp, "Cannot set file child to format node without "
                       "file child");
            return -EINVAL;
        }
        role = child->role;
    }

    /* Drop the old child, if any; both steps are recorded in @tran */
    if (child) {
        bdrv_unset_inherits_from(parent_bs, child, tran);
        bdrv_remove_file_or_backing_child(parent_bs, child, tran);
    }

    if (!child_bs) {
        goto out;
    }

    ret = bdrv_attach_child_noperm(parent_bs, child_bs,
                                   is_backing ? "backing" : "file",
                                   &child_of_bds, role,
                                   is_backing ? &parent_bs->backing :
                                                &parent_bs->file,
                                   tran, errp);
    if (ret < 0) {
        return ret;
    }


    /*
     * If inherits_from pointed recursively to bs then let's update it to
     * point directly to bs (else it will become NULL).
     */
    if (update_inherits_from) {
        bdrv_set_inherits_from(child_bs, parent_bs, tran);
    }

out:
    /* Errors from refreshing the limits are ignored (NULL errp) */
    bdrv_refresh_limits(parent_bs, tran, NULL);

    return 0;
}
   3235
   3236static int bdrv_set_backing_noperm(BlockDriverState *bs,
   3237                                   BlockDriverState *backing_hd,
   3238                                   Transaction *tran, Error **errp)
   3239{
   3240    return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
   3241}
   3242
   3243int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
   3244                        Error **errp)
   3245{
   3246    int ret;
   3247    Transaction *tran = tran_new();
   3248
   3249    ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
   3250    if (ret < 0) {
   3251        goto out;
   3252    }
   3253
   3254    ret = bdrv_refresh_perms(bs, errp);
   3255out:
   3256    tran_finalize(tran, ret);
   3257
   3258    return ret;
   3259}
   3260
   3261/*
   3262 * Opens the backing file for a BlockDriverState if not yet open
   3263 *
   3264 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
   3265 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
   3266 * itself, all options starting with "${bdref_key}." are considered part of the
   3267 * BlockdevRef.
   3268 *
   3269 * TODO Can this be unified with bdrv_open_image()?
   3270 */
   3271int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
   3272                           const char *bdref_key, Error **errp)
   3273{
   3274    char *backing_filename = NULL;
   3275    char *bdref_key_dot;
   3276    const char *reference = NULL;
   3277    int ret = 0;
   3278    bool implicit_backing = false;
   3279    BlockDriverState *backing_hd;
   3280    QDict *options;
   3281    QDict *tmp_parent_options = NULL;
   3282    Error *local_err = NULL;
   3283
   3284    if (bs->backing != NULL) {
   3285        goto free_exit;
   3286    }
   3287
   3288    /* NULL means an empty set of options */
   3289    if (parent_options == NULL) {
   3290        tmp_parent_options = qdict_new();
   3291        parent_options = tmp_parent_options;
   3292    }
   3293
   3294    bs->open_flags &= ~BDRV_O_NO_BACKING;
   3295
   3296    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
   3297    qdict_extract_subqdict(parent_options, &options, bdref_key_dot);
   3298    g_free(bdref_key_dot);
   3299
   3300    /*
   3301     * Caution: while qdict_get_try_str() is fine, getting non-string
   3302     * types would require more care.  When @parent_options come from
   3303     * -blockdev or blockdev_add, its members are typed according to
   3304     * the QAPI schema, but when they come from -drive, they're all
   3305     * QString.
   3306     */
   3307    reference = qdict_get_try_str(parent_options, bdref_key);
   3308    if (reference || qdict_haskey(options, "file.filename")) {
   3309        /* keep backing_filename NULL */
   3310    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
   3311        qobject_unref(options);
   3312        goto free_exit;
   3313    } else {
   3314        if (qdict_size(options) == 0) {
   3315            /* If the user specifies options that do not modify the
   3316             * backing file's behavior, we might still consider it the
   3317             * implicit backing file.  But it's easier this way, and
   3318             * just specifying some of the backing BDS's options is
   3319             * only possible with -drive anyway (otherwise the QAPI
   3320             * schema forces the user to specify everything). */
   3321            implicit_backing = !strcmp(bs->auto_backing_file, bs->backing_file);
   3322        }
   3323
   3324        backing_filename = bdrv_get_full_backing_filename(bs, &local_err);
   3325        if (local_err) {
   3326            ret = -EINVAL;
   3327            error_propagate(errp, local_err);
   3328            qobject_unref(options);
   3329            goto free_exit;
   3330        }
   3331    }
   3332
   3333    if (!bs->drv || !bs->drv->supports_backing) {
   3334        ret = -EINVAL;
   3335        error_setg(errp, "Driver doesn't support backing files");
   3336        qobject_unref(options);
   3337        goto free_exit;
   3338    }
   3339
   3340    if (!reference &&
   3341        bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
   3342        qdict_put_str(options, "driver", bs->backing_format);
   3343    }
   3344
   3345    backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs,
   3346                                   &child_of_bds, bdrv_backing_role(bs), errp);
   3347    if (!backing_hd) {
   3348        bs->open_flags |= BDRV_O_NO_BACKING;
   3349        error_prepend(errp, "Could not open backing file: ");
   3350        ret = -EINVAL;
   3351        goto free_exit;
   3352    }
   3353
   3354    if (implicit_backing) {
   3355        bdrv_refresh_filename(backing_hd);
   3356        pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
   3357                backing_hd->filename);
   3358    }
   3359
   3360    /* Hook up the backing file link; drop our reference, bs owns the
   3361     * backing_hd reference now */
   3362    ret = bdrv_set_backing_hd(bs, backing_hd, errp);
   3363    bdrv_unref(backing_hd);
   3364    if (ret < 0) {
   3365        goto free_exit;
   3366    }
   3367
   3368    qdict_del(parent_options, bdref_key);
   3369
   3370free_exit:
   3371    g_free(backing_filename);
   3372    qobject_unref(tmp_parent_options);
   3373    return ret;
   3374}
   3375
   3376static BlockDriverState *
   3377bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
   3378                   BlockDriverState *parent, const BdrvChildClass *child_class,
   3379                   BdrvChildRole child_role, bool allow_none, Error **errp)
   3380{
   3381    BlockDriverState *bs = NULL;
   3382    QDict *image_options;
   3383    char *bdref_key_dot;
   3384    const char *reference;
   3385
   3386    assert(child_class != NULL);
   3387
   3388    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
   3389    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
   3390    g_free(bdref_key_dot);
   3391
   3392    /*
   3393     * Caution: while qdict_get_try_str() is fine, getting non-string
   3394     * types would require more care.  When @options come from
   3395     * -blockdev or blockdev_add, its members are typed according to
   3396     * the QAPI schema, but when they come from -drive, they're all
   3397     * QString.
   3398     */
   3399    reference = qdict_get_try_str(options, bdref_key);
   3400    if (!filename && !reference && !qdict_size(image_options)) {
   3401        if (!allow_none) {
   3402            error_setg(errp, "A block device must be specified for \"%s\"",
   3403                       bdref_key);
   3404        }
   3405        qobject_unref(image_options);
   3406        goto done;
   3407    }
   3408
   3409    bs = bdrv_open_inherit(filename, reference, image_options, 0,
   3410                           parent, child_class, child_role, errp);
   3411    if (!bs) {
   3412        goto done;
   3413    }
   3414
   3415done:
   3416    qdict_del(options, bdref_key);
   3417    return bs;
   3418}
   3419
   3420/*
   3421 * Opens a disk image whose options are given as BlockdevRef in another block
   3422 * device's options.
   3423 *
   3424 * If allow_none is true, no image will be opened if filename is false and no
   3425 * BlockdevRef is given. NULL will be returned, but errp remains unset.
   3426 *
   3427 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
   3428 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
   3429 * itself, all options starting with "${bdref_key}." are considered part of the
   3430 * BlockdevRef.
   3431 *
   3432 * The BlockdevRef will be removed from the options QDict.
   3433 */
   3434BdrvChild *bdrv_open_child(const char *filename,
   3435                           QDict *options, const char *bdref_key,
   3436                           BlockDriverState *parent,
   3437                           const BdrvChildClass *child_class,
   3438                           BdrvChildRole child_role,
   3439                           bool allow_none, Error **errp)
   3440{
   3441    BlockDriverState *bs;
   3442
   3443    bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class,
   3444                            child_role, allow_none, errp);
   3445    if (bs == NULL) {
   3446        return NULL;
   3447    }
   3448
   3449    return bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
   3450                             errp);
   3451}
   3452
   3453/*
   3454 * TODO Future callers may need to specify parent/child_class in order for
   3455 * option inheritance to work. Existing callers use it for the root node.
   3456 */
   3457BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
   3458{
   3459    BlockDriverState *bs = NULL;
   3460    QObject *obj = NULL;
   3461    QDict *qdict = NULL;
   3462    const char *reference = NULL;
   3463    Visitor *v = NULL;
   3464
   3465    if (ref->type == QTYPE_QSTRING) {
   3466        reference = ref->u.reference;
   3467    } else {
   3468        BlockdevOptions *options = &ref->u.definition;
   3469        assert(ref->type == QTYPE_QDICT);
   3470
   3471        v = qobject_output_visitor_new(&obj);
   3472        visit_type_BlockdevOptions(v, NULL, &options, &error_abort);
   3473        visit_complete(v, &obj);
   3474
   3475        qdict = qobject_to(QDict, obj);
   3476        qdict_flatten(qdict);
   3477
   3478        /* bdrv_open_inherit() defaults to the values in bdrv_flags (for
   3479         * compatibility with other callers) rather than what we want as the
   3480         * real defaults. Apply the defaults here instead. */
   3481        qdict_set_default_str(qdict, BDRV_OPT_CACHE_DIRECT, "off");
   3482        qdict_set_default_str(qdict, BDRV_OPT_CACHE_NO_FLUSH, "off");
   3483        qdict_set_default_str(qdict, BDRV_OPT_READ_ONLY, "off");
   3484        qdict_set_default_str(qdict, BDRV_OPT_AUTO_READ_ONLY, "off");
   3485
   3486    }
   3487
   3488    bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp);
   3489    obj = NULL;
   3490    qobject_unref(obj);
   3491    visit_free(v);
   3492    return bs;
   3493}
   3494
   3495static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
   3496                                                   int flags,
   3497                                                   QDict *snapshot_options,
   3498                                                   Error **errp)
   3499{
   3500    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
   3501    char *tmp_filename = g_malloc0(PATH_MAX + 1);
   3502    int64_t total_size;
   3503    QemuOpts *opts = NULL;
   3504    BlockDriverState *bs_snapshot = NULL;
   3505    int ret;
   3506
   3507    /* if snapshot, we create a temporary backing file and open it
   3508       instead of opening 'filename' directly */
   3509
   3510    /* Get the required size from the image */
   3511    total_size = bdrv_getlength(bs);
   3512    if (total_size < 0) {
   3513        error_setg_errno(errp, -total_size, "Could not get image size");
   3514        goto out;
   3515    }
   3516
   3517    /* Create the temporary image */
   3518    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
   3519    if (ret < 0) {
   3520        error_setg_errno(errp, -ret, "Could not get temporary filename");
   3521        goto out;
   3522    }
   3523
   3524    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
   3525                            &error_abort);
   3526    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
   3527    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, errp);
   3528    qemu_opts_del(opts);
   3529    if (ret < 0) {
   3530        error_prepend(errp, "Could not create temporary overlay '%s': ",
   3531                      tmp_filename);
   3532        goto out;
   3533    }
   3534
   3535    /* Prepare options QDict for the temporary file */
   3536    qdict_put_str(snapshot_options, "file.driver", "file");
   3537    qdict_put_str(snapshot_options, "file.filename", tmp_filename);
   3538    qdict_put_str(snapshot_options, "driver", "qcow2");
   3539
   3540    bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
   3541    snapshot_options = NULL;
   3542    if (!bs_snapshot) {
   3543        goto out;
   3544    }
   3545
   3546    ret = bdrv_append(bs_snapshot, bs, errp);
   3547    if (ret < 0) {
   3548        bs_snapshot = NULL;
   3549        goto out;
   3550    }
   3551
   3552out:
   3553    qobject_unref(snapshot_options);
   3554    g_free(tmp_filename);
   3555    return bs_snapshot;
   3556}
   3557
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given;
 * the existing BDS is looked up and returned with an additional reference.
 * Without a reference, a new BDS is created.
 *
 * parent, child_class and child_role describe the node on whose behalf this
 * BDS is opened, so that flags and options can be inherited from it. parent
 * and child_class must be either both set or both NULL, and flags must be 0
 * when child_class is set.
 *
 * Returns the opened BDS, or the temporary overlay on top of it when
 * BDRV_O_SNAPSHOT is in effect. Returns NULL on failure, with the error
 * reported through errp.
 */
static BlockDriverState *bdrv_open_inherit(const char *filename,
                                           const char *reference,
                                           QDict *options, int flags,
                                           BlockDriverState *parent,
                                           const BdrvChildClass *child_class,
                                           BdrvChildRole child_role,
                                           Error **errp)
{
    int ret;
    BlockBackend *file = NULL;
    BlockDriverState *bs;
    BlockDriver *drv = NULL;
    BdrvChild *child;
    const char *drvname;
    const char *backing;
    Error *local_err = NULL;
    QDict *snapshot_options = NULL;
    int snapshot_flags = 0;

    assert(!child_class || !flags);
    assert(!child_class == !parent);

    if (reference) {
        /* We own the options reference; only whether it was empty matters */
        bool options_non_empty = options ? qdict_size(options) : false;
        qobject_unref(options);

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return NULL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return NULL;
        }

        /* The caller gets its own reference to the existing node */
        bdrv_ref(bs);
        return bs;
    }

    bs = bdrv_new();

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    /* json: syntax counts as explicit options, as if in the QDict */
    parse_json_protocol(options, &filename, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Copy taken before inherited options and bdrv_fill_options() results
     * are merged into options */
    bs->explicit_options = qdict_clone_shallow(options);

    if (child_class) {
        bool parent_is_format;

        if (parent->drv) {
            parent_is_format = parent->drv->is_format;
        } else {
            /*
             * parent->drv is not set yet because this node is opened for
             * (potential) format probing.  That means that @parent is going
             * to be a format node.
             */
            parent_is_format = true;
        }

        bs->inherits_from = parent;
        child_class->inherit_options(child_role, parent_is_format,
                                     &flags, options,
                                     parent->open_flags, parent->options);
    }

    ret = bdrv_fill_options(&options, filename, &flags, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /*
     * Set the BDRV_O_RDWR and BDRV_O_ALLOW_RDWR flags.
     * Caution: getting a boolean member of @options requires care.
     * When @options come from -blockdev or blockdev_add, members are
     * typed according to the QAPI schema, but when they come from
     * -drive, they're all QString.
     */
    if (g_strcmp0(qdict_get_try_str(options, BDRV_OPT_READ_ONLY), "on") &&
        !qdict_get_try_bool(options, BDRV_OPT_READ_ONLY, false)) {
        flags |= (BDRV_O_RDWR | BDRV_O_ALLOW_RDWR);
    } else {
        flags &= ~BDRV_O_RDWR;
    }

    if (flags & BDRV_O_SNAPSHOT) {
        snapshot_options = qdict_new();
        bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
                                   flags, options);
        /* Let bdrv_backing_options() override "read-only" */
        qdict_del(options, BDRV_OPT_READ_ONLY);
        bdrv_inherited_options(BDRV_CHILD_COW, true,
                               &flags, options, flags, options);
    }

    /* bs->options keeps the full set of options. The local copy made here is
     * handed to the open functions below; whatever remains in it at the end
     * is reported as an unknown option. */
    bs->open_flags = flags;
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Find the right image format driver */
    /* See cautionary note on accessing @options above */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));

    /* See cautionary note on accessing @options above */
    backing = qdict_get_try_str(options, "backing");
    if (qobject_to(QNull, qdict_get(options, "backing")) != NULL ||
        (backing && *backing == '\0'))
    {
        if (backing) {
            warn_report("Use of \"backing\": \"\" is deprecated; "
                        "use \"backing\": null instead");
        }
        flags |= BDRV_O_NO_BACKING;
        qdict_del(bs->explicit_options, "backing");
        qdict_del(bs->options, "backing");
        qdict_del(options, "backing");
    }

    /* Open image file without format layer. This BlockBackend is only used for
     * probing, the block drivers will do their own bdrv_open_child() for the
     * same BDS, which is why we put the node name back into options. */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        BlockDriverState *file_bs;

        file_bs = bdrv_open_child_bs(filename, options, "file", bs,
                                     &child_of_bds, BDRV_CHILD_IMAGE,
                                     true, &local_err);
        if (local_err) {
            goto fail;
        }
        if (file_bs != NULL) {
            /* Not requesting BLK_PERM_CONSISTENT_READ because we're only
             * looking at the header to guess the image format. This works even
             * in cases where a guest would not see a consistent state. */
            file = blk_new(bdrv_get_aio_context(file_bs), 0, BLK_PERM_ALL);
            blk_insert_bs(file, file_bs, &local_err);
            bdrv_unref(file_bs);
            if (local_err) {
                goto fail;
            }

            qdict_put_str(options, "file", bdrv_get_node_name(file_bs));
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
        /*
         * This option update would logically belong in bdrv_fill_options(),
         * but we first need to open bs->file for the probing to work, while
         * opening bs->file already requires the (mostly) final set of options
         * so that cache mode etc. can be inherited.
         *
         * Adding the driver later is somewhat ugly, but it's not an option
         * that would ever be inherited, so it's correct. We just need to make
         * sure to update both bs->options (which has the full effective
         * options for bs) and options (which has file.* already removed).
         */
        qdict_put_str(bs->options, "driver", drv->format_name);
        qdict_put_str(options, "driver", drv->format_name);
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        goto fail;
    }

    /* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */
    assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->bdrv_file_open);
    /* file must be NULL if a protocol BDS is about to be created
     * (the inverse results in an error message from bdrv_open_common()) */
    assert(!(flags & BDRV_O_PROTOCOL) || !file);

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* The probing BlockBackend is no longer needed once the image is open */
    if (file) {
        blk_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        ret = bdrv_open_backing_file(bs, options, "backing", &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Remove all children options and references
     * from bs->options and bs->explicit_options */
    QLIST_FOREACH(child, &bs->children, next) {
        char *child_key_dot;
        child_key_dot = g_strdup_printf("%s.", child->name);
        qdict_extract_subqdict(bs->explicit_options, NULL, child_key_dot);
        qdict_extract_subqdict(bs->options, NULL, child_key_dot);
        qdict_del(bs->explicit_options, child->name);
        qdict_del(bs->options, child->name);
        g_free(child_key_dot);
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp,
                       "Block format '%s' does not support the option '%s'",
                       drv->format_name, entry->key);
        }

        goto close_and_fail;
    }

    bdrv_parent_cb_change_media(bs, true);

    qobject_unref(options);
    options = NULL;

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        BlockDriverState *snapshot_bs;
        snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
                                                snapshot_options, &local_err);
        /* bdrv_append_temp_snapshot() consumed the reference */
        snapshot_options = NULL;
        if (local_err) {
            goto close_and_fail;
        }
        /* We are not going to return bs but the overlay on top of it
         * (snapshot_bs); thus, we have to drop the strong reference to bs
         * (which we obtained by calling bdrv_new()). bs will not be deleted,
         * though, because the overlay still has a reference to it. */
        bdrv_unref(bs);
        bs = snapshot_bs;
    }

    return bs;

    /* Error path for failures up to and including bdrv_open_common(): the
     * option QDicts stored in bs are released and cleared here, before the
     * node itself is dropped. */
fail:
    blk_unref(file);
    qobject_unref(snapshot_options);
    qobject_unref(bs->explicit_options);
    qobject_unref(bs->options);
    qobject_unref(options);
    bs->options = NULL;
    bs->explicit_options = NULL;
    bdrv_unref(bs);
    error_propagate(errp, local_err);
    return NULL;

    /* Error path for failures after bdrv_open_common() has succeeded: only
     * the local references are released in addition to the node. */
close_and_fail:
    bdrv_unref(bs);
    qobject_unref(snapshot_options);
    qobject_unref(options);
    error_propagate(errp, local_err);
    return NULL;
}
   3858
   3859BlockDriverState *bdrv_open(const char *filename, const char *reference,
   3860                            QDict *options, int flags, Error **errp)
   3861{
   3862    return bdrv_open_inherit(filename, reference, options, flags, NULL,
   3863                             NULL, 0, errp);
   3864}
   3865
   3866/* Return true if the NULL-terminated @list contains @str */
   3867static bool is_str_in_list(const char *str, const char *const *list)
   3868{
   3869    if (str && list) {
   3870        int i;
   3871        for (i = 0; list[i] != NULL; i++) {
   3872            if (!strcmp(str, list[i])) {
   3873                return true;
   3874            }
   3875        }
   3876    }
   3877    return false;
   3878}
   3879
   3880/*
   3881 * Check that every option set in @bs->options is also set in
   3882 * @new_opts.
   3883 *
   3884 * Options listed in the common_options list and in
   3885 * @bs->drv->mutable_opts are skipped.
   3886 *
   3887 * Return 0 on success, otherwise return -EINVAL and set @errp.
   3888 */
   3889static int bdrv_reset_options_allowed(BlockDriverState *bs,
   3890                                      const QDict *new_opts, Error **errp)
   3891{
   3892    const QDictEntry *e;
   3893    /* These options are common to all block drivers and are handled
   3894     * in bdrv_reopen_prepare() so they can be left out of @new_opts */
   3895    const char *const common_options[] = {
   3896        "node-name", "discard", "cache.direct", "cache.no-flush",
   3897        "read-only", "auto-read-only", "detect-zeroes", NULL
   3898    };
   3899
   3900    for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
   3901        if (!qdict_haskey(new_opts, e->key) &&
   3902            !is_str_in_list(e->key, common_options) &&
   3903            !is_str_in_list(e->key, bs->drv->mutable_opts)) {
   3904            error_setg(errp, "Option '%s' cannot be reset "
   3905                       "to its default value", e->key);
   3906            return -EINVAL;
   3907        }
   3908    }
   3909
   3910    return 0;
   3911}
   3912
   3913/*
   3914 * Returns true if @child can be reached recursively from @bs
   3915 */
   3916static bool bdrv_recurse_has_child(BlockDriverState *bs,
   3917                                   BlockDriverState *child)
   3918{
   3919    BdrvChild *c;
   3920
   3921    if (bs == child) {
   3922        return true;
   3923    }
   3924
   3925    QLIST_FOREACH(c, &bs->children, next) {
   3926        if (bdrv_recurse_has_child(c->bs, child)) {
   3927            return true;
   3928        }
   3929    }
   3930
   3931    return false;
   3932}
   3933
/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had QTAILQ_INIT
 * already performed, or alternatively may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * options contains the changed options for the associated bs
 * (the BlockReopenQueue takes ownership). NULL is treated as an empty set.
 *
 * klass, role, parent_is_format, parent_options and parent_flags describe the
 * parent from which this call recursed. If parent_options is non-NULL, flags
 * and options are inherited from the parent through klass->inherit_options();
 * otherwise the current flags of bs are used as the starting point.
 *
 * keep_old_opts selects whether options that are not given in options retain
 * their current values from bs.
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
 */
static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
                                                 BlockDriverState *bs,
                                                 QDict *options,
                                                 const BdrvChildClass *klass,
                                                 BdrvChildRole role,
                                                 bool parent_is_format,
                                                 QDict *parent_options,
                                                 int parent_flags,
                                                 bool keep_old_opts)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    BdrvChild *child;
    QDict *old_options, *explicit_options, *options_copy;
    int flags;
    QemuOpts *opts;

    /* Make sure that the caller remembered to use a drained section. This is
     * important to avoid graph changes between the recursive queuing here and
     * bdrv_reopen_multiple(). */
    assert(bs->quiesce_counter > 0);

    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QTAILQ_INIT(bs_queue);
    }

    if (!options) {
        options = qdict_new();
    }

    /* Check if this BlockDriverState is already in the queue */
    /* (bs_entry is left NULL when the loop finishes without a match) */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bs == bs_entry->state.bs) {
            break;
        }
    }

    /*
     * Precedence of options:
     * 1. Explicitly passed in options (highest)
     * 2. Retained from explicitly set options of bs
     * 3. Inherited from parent node
     * 4. Retained from effective options of bs
     */

    /* Old explicitly set values (don't overwrite by inherited value) */
    if (bs_entry || keep_old_opts) {
        old_options = qdict_clone_shallow(bs_entry ?
                                          bs_entry->state.explicit_options :
                                          bs->explicit_options);
        bdrv_join_options(bs, options, old_options);
        qobject_unref(old_options);
    }

    /* Copy taken before anything inherited from the parent is merged in */
    explicit_options = qdict_clone_shallow(options);

    /* Inherit from parent node */
    if (parent_options) {
        flags = 0;
        klass->inherit_options(role, parent_is_format, &flags, options,
                               parent_flags, parent_options);
    } else {
        flags = bdrv_get_flags(bs);
    }

    if (keep_old_opts) {
        /* Old values are used for options that aren't set yet */
        old_options = qdict_clone_shallow(bs->options);
        bdrv_join_options(bs, options, old_options);
        qobject_unref(old_options);
    }

    /* We have the final set of options so let's update the flags */
    /* (done on a copy, so that options itself is not modified here) */
    options_copy = qdict_clone_shallow(options);
    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options_copy, NULL);
    update_flags_from_options(&flags, opts);
    qemu_opts_del(opts);
    qobject_unref(options_copy);

    /* bdrv_open_inherit() sets and clears some additional flags internally */
    flags &= ~BDRV_O_PROTOCOL;
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    if (!bs_entry) {
        bs_entry = g_new0(BlockReopenQueueEntry, 1);
        QTAILQ_INSERT_TAIL(bs_queue, bs_entry, entry);
    } else {
        /* Re-queued node: drop the QDicts from the earlier request, they are
         * replaced below */
        qobject_unref(bs_entry->state.options);
        qobject_unref(bs_entry->state.explicit_options);
    }

    bs_entry->state.bs = bs;
    bs_entry->state.options = options;
    bs_entry->state.explicit_options = explicit_options;
    bs_entry->state.flags = flags;

    /*
     * If keep_old_opts is false then it means that unspecified
     * options must be reset to their original value. We don't allow
     * resetting 'backing' but we need to know if the option is
     * missing in order to decide if we have to return an error.
     */
    if (!keep_old_opts) {
        bs_entry->state.backing_missing =
            !qdict_haskey(options, "backing") &&
            !qdict_haskey(options, "backing.driver");
    }

    /* Queue the children that inherit from bs as well */
    QLIST_FOREACH(child, &bs->children, next) {
        QDict *new_child_options = NULL;
        bool child_keep_old = keep_old_opts;

        /* reopen can only change the options of block devices that were
         * implicitly created and inherited options. For other (referenced)
         * block devices, a syntax like "backing.foo" results in an error. */
        if (child->bs->inherits_from != bs) {
            continue;
        }

        /* Check if the options contain a child reference */
        if (qdict_haskey(options, child->name)) {
            const char *childref = qdict_get_try_str(options, child->name);
            /*
             * The current child must not be reopened if the child
             * reference is null or points to a different node.
             */
            if (g_strcmp0(childref, child->bs->node_name)) {
                continue;
            }
            /*
             * If the child reference points to the current child then
             * reopen it with its existing set of options (note that
             * it can still inherit new options from the parent).
             */
            child_keep_old = true;
        } else {
            /* Extract child options ("child-name.*") */
            char *child_key_dot = g_strdup_printf("%s.", child->name);
            qdict_extract_subqdict(explicit_options, NULL, child_key_dot);
            qdict_extract_subqdict(options, &new_child_options, child_key_dot);
            g_free(child_key_dot);
        }

        bdrv_reopen_queue_child(bs_queue, child->bs, new_child_options,
                                child->klass, child->role, bs->drv->is_format,
                                options, flags, child_keep_old);
    }

    return bs_queue;
}
   4111
   4112BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
   4113                                    BlockDriverState *bs,
   4114                                    QDict *options, bool keep_old_opts)
   4115{
   4116    return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, 0, false,
   4117                                   NULL, 0, keep_old_opts);
   4118}
   4119
   4120void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)
   4121{
   4122    if (bs_queue) {
   4123        BlockReopenQueueEntry *bs_entry, *next;
   4124        QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
   4125            qobject_unref(bs_entry->state.explicit_options);
   4126            qobject_unref(bs_entry->state.options);
   4127            g_free(bs_entry);
   4128        }
   4129        g_free(bs_queue);
   4130    }
   4131}
   4132
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 * bs_queue is freed before returning, on success as well as on failure.
 *
 * Returns 0 on success and a negative value on failure.
 *
 * All affected nodes must be drained between bdrv_reopen_queue() and
 * bdrv_reopen_multiple().
 *
 * To be called from the main thread, with all other AioContexts unlocked.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    AioContext *ctx;
    Transaction *tran = tran_new();
    g_autoptr(GHashTable) found = NULL;
    g_autoptr(GSList) refresh_list = NULL;

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bs_queue != NULL);

    /* Step 1: flush every queued node, each under its own AioContext lock */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        ret = bdrv_flush(bs_entry->state.bs);
        aio_context_release(ctx);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Error flushing drive");
            goto abort;
        }
    }

    /* Step 2: prepare every node; remember which ones got prepared so that
     * the abort path only undoes those */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        assert(bs_entry->state.bs->quiesce_counter > 0);
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        ret = bdrv_reopen_prepare(&bs_entry->state, bs_queue, tran, errp);
        aio_context_release(ctx);
        if (ret < 0) {
            goto abort;
        }
        bs_entry->prepared = true;
    }

    /* Step 3: collect the nodes whose permissions have to be refreshed: the
     * queued nodes plus any old backing/file nodes recorded in their state */
    found = g_hash_table_new(NULL, NULL);
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        BDRVReopenState *state = &bs_entry->state;

        refresh_list = bdrv_topological_dfs(refresh_list, found, state->bs);
        if (state->old_backing_bs) {
            refresh_list = bdrv_topological_dfs(refresh_list, found,
                                                state->old_backing_bs);
        }
        if (state->old_file_bs) {
            refresh_list = bdrv_topological_dfs(refresh_list, found,
                                                state->old_file_bs);
        }
    }

    /*
     * Note that the file-posix driver relies on the permission update done
     * during reopen (even if no permission changed), because it wants "new"
     * permissions for reconfiguring the fd and that's why it does it in
     * raw_check_perm(), not in raw_reopen_prepare() which is called with "old"
     * permissions.
     */
    ret = bdrv_list_refresh_perms(refresh_list, bs_queue, tran, errp);
    if (ret < 0) {
        goto abort;
    }

    /*
     * If we reach this point, we have success and just need to apply the
     * changes.
     *
     * Reverse order is used to accommodate the qcow2 driver: on commit it
     * needs to write the IN_USE flag to the image, to mark bitmaps in the
     * image as invalid. But children usually come after parents in the
     * reopen queue, so go from the last to the first element.
     */
    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        bdrv_reopen_commit(&bs_entry->state);
        aio_context_release(ctx);
    }

    tran_commit(tran);

    /* Give drivers that implement it a hook after the transaction has been
     * committed, again in reverse queue order */
    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
        BlockDriverState *bs = bs_entry->state.bs;

        if (bs->drv->bdrv_reopen_commit_post) {
            ctx = bdrv_get_aio_context(bs);
            aio_context_acquire(ctx);
            bs->drv->bdrv_reopen_commit_post(&bs_entry->state);
            aio_context_release(ctx);
        }
    }

    ret = 0;
    goto cleanup;

    /* Failure: abort the transaction and every entry that was prepared */
abort:
    tran_abort(tran);
    QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (bs_entry->prepared) {
            ctx = bdrv_get_aio_context(bs_entry->state.bs);
            aio_context_acquire(ctx);
            bdrv_reopen_abort(&bs_entry->state);
            aio_context_release(ctx);
        }
    }

    /* Common exit: the queue is consumed regardless of the outcome */
cleanup:
    bdrv_reopen_queue_free(bs_queue);

    return ret;
}
   4261
   4262int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
   4263                Error **errp)
   4264{
   4265    AioContext *ctx = bdrv_get_aio_context(bs);
   4266    BlockReopenQueue *queue;
   4267    int ret;
   4268
   4269    bdrv_subtree_drained_begin(bs);
   4270    if (ctx != qemu_get_aio_context()) {
   4271        aio_context_release(ctx);
   4272    }
   4273
   4274    queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
   4275    ret = bdrv_reopen_multiple(queue, errp);
   4276
   4277    if (ctx != qemu_get_aio_context()) {
   4278        aio_context_acquire(ctx);
   4279    }
   4280    bdrv_subtree_drained_end(bs);
   4281
   4282    return ret;
   4283}
   4284
   4285int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
   4286                              Error **errp)
   4287{
   4288    QDict *opts = qdict_new();
   4289
   4290    qdict_put_bool(opts, BDRV_OPT_READ_ONLY, read_only);
   4291
   4292    return bdrv_reopen(bs, opts, true, errp);
   4293}
   4294
   4295/*
   4296 * Take a BDRVReopenState and check if the value of 'backing' in the
   4297 * reopen_state->options QDict is valid or not.
   4298 *
   4299 * If 'backing' is missing from the QDict then return 0.
   4300 *
   4301 * If 'backing' contains the node name of the backing file of
   4302 * reopen_state->bs then return 0.
   4303 *
   4304 * If 'backing' contains a different node name (or is null) then check
   4305 * whether the current backing file can be replaced with the new one.
   4306 * If that's the case then reopen_state->replace_backing_bs is set to
   4307 * true and reopen_state->new_backing_bs contains a pointer to the new
   4308 * backing BlockDriverState (or NULL).
   4309 *
   4310 * Return 0 on success, otherwise return < 0 and set @errp.
   4311 */
   4312static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
   4313                                             bool is_backing, Transaction *tran,
   4314                                             Error **errp)
   4315{
   4316    BlockDriverState *bs = reopen_state->bs;
   4317    BlockDriverState *new_child_bs;
   4318    BlockDriverState *old_child_bs = is_backing ? child_bs(bs->backing) :
   4319                                                  child_bs(bs->file);
   4320    const char *child_name = is_backing ? "backing" : "file";
   4321    QObject *value;
   4322    const char *str;
   4323
   4324    value = qdict_get(reopen_state->options, child_name);
   4325    if (value == NULL) {
   4326        return 0;
   4327    }
   4328
   4329    switch (qobject_type(value)) {
   4330    case QTYPE_QNULL:
   4331        assert(is_backing); /* The 'file' option does not allow a null value */
   4332        new_child_bs = NULL;
   4333        break;
   4334    case QTYPE_QSTRING:
   4335        str = qstring_get_str(qobject_to(QString, value));
   4336        new_child_bs = bdrv_lookup_bs(NULL, str, errp);
   4337        if (new_child_bs == NULL) {
   4338            return -EINVAL;
   4339        } else if (bdrv_recurse_has_child(new_child_bs, bs)) {
   4340            error_setg(errp, "Making '%s' a %s child of '%s' would create a "
   4341                       "cycle", str, child_name, bs->node_name);
   4342            return -EINVAL;
   4343        }
   4344        break;
   4345    default:
   4346        /*
   4347         * The options QDict has been flattened, so 'backing' and 'file'
   4348         * do not allow any other data type here.
   4349         */
   4350        g_assert_not_reached();
   4351    }
   4352
   4353    if (old_child_bs == new_child_bs) {
   4354        return 0;
   4355    }
   4356
   4357    if (old_child_bs) {
   4358        if (bdrv_skip_implicit_filters(old_child_bs) == new_child_bs) {
   4359            return 0;
   4360        }
   4361
   4362        if (old_child_bs->implicit) {
   4363            error_setg(errp, "Cannot replace implicit %s child of %s",
   4364                       child_name, bs->node_name);
   4365            return -EPERM;
   4366        }
   4367    }
   4368
   4369    if (bs->drv->is_filter && !old_child_bs) {
   4370        /*
   4371         * Filters always have a file or a backing child, so we are trying to
   4372         * change wrong child
   4373         */
   4374        error_setg(errp, "'%s' is a %s filter node that does not support a "
   4375                   "%s child", bs->node_name, bs->drv->format_name, child_name);
   4376        return -EINVAL;
   4377    }
   4378
   4379    if (is_backing) {
   4380        reopen_state->old_backing_bs = old_child_bs;
   4381    } else {
   4382        reopen_state->old_file_bs = old_child_bs;
   4383    }
   4384
   4385    return bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
   4386                                           tran, errp);
   4387}
   4388
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * reopen_state describes the node to reopen (reopen_state->bs) together
 *   with its new options and flags
 * queue is the reopen queue; it is passed on to the driver's
 *   .bdrv_reopen_prepare()
 * change_child_tran is the transaction in which changes of the 'backing'
 *   and 'file' children are recorded
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * If the driver's .bdrv_reopen_prepare() has succeeded but a later step in
 * this function fails, the driver's .bdrv_reopen_abort() is called from here
 * before returning. It is the responsibility of the caller to then call the
 * abort() or commit() for any other BDS that have been left in a prepare()
 * state
 *
 */
static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue,
                               Transaction *change_child_tran, Error **errp)
{
    int ret = -1;
    int old_flags;
    Error *local_err = NULL;
    BlockDriver *drv;
    QemuOpts *opts;
    QDict *orig_reopen_opts;
    char *discard = NULL;
    bool read_only;
    /* Set once drv->bdrv_reopen_prepare() succeeded; see the error path */
    bool drv_prepared = false;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* This function and each driver's bdrv_reopen_prepare() remove
     * entries from reopen_state->options as they are processed, so
     * we need to make a copy of the original QDict. */
    orig_reopen_opts = qdict_clone_shallow(reopen_state->options);

    /* Process generic block layer options */
    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, reopen_state->options, errp)) {
        ret = -EINVAL;
        goto error;
    }

    /* This was already called in bdrv_reopen_queue_child() so the flags
     * are up-to-date. This time we simply want to remove the options from
     * QemuOpts in order to indicate that they have been processed. */
    old_flags = reopen_state->flags;
    update_flags_from_options(&reopen_state->flags, opts);
    assert(old_flags == reopen_state->flags);

    discard = qemu_opt_get_del(opts, BDRV_OPT_DISCARD);
    if (discard != NULL) {
        if (bdrv_parse_discard_flags(discard, &reopen_state->flags) != 0) {
            error_setg(errp, "Invalid discard option");
            ret = -EINVAL;
            goto error;
        }
    }

    reopen_state->detect_zeroes =
        bdrv_parse_detect_zeroes(opts, reopen_state->flags, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto error;
    }

    /* All other options (including node-name and driver) must be unchanged.
     * Put them back into the QDict, so that they are checked at the end
     * of this function. */
    qemu_opts_to_qdict(opts, reopen_state->options);

    /* If we are to stay read-only, do not allow permission change
     * to r/w. Attempting to set to r/w may fail if either BDRV_O_ALLOW_RDWR is
     * not set, or if the BDS still has copy_on_read enabled */
    read_only = !(reopen_state->flags & BDRV_O_RDWR);
    ret = bdrv_can_set_read_only(reopen_state->bs, read_only, true, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        /*
         * If a driver-specific option is missing, it means that we
         * should reset it to its default value.
         * But not all options allow that, so we need to check it first.
         */
        ret = bdrv_reset_options_allowed(reopen_state->bs,
                                         reopen_state->options, errp);
        if (ret) {
            goto error;
        }

        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                /* The driver failed without an error: report a generic one */
                bdrv_refresh_filename(reopen_state->bs);
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_setg(errp, "Block format '%s' used by node '%s' "
                   "does not support reopening files", drv->format_name,
                   bdrv_get_device_or_node_name(reopen_state->bs));
        ret = -1;
        goto error;
    }

    drv_prepared = true;

    /*
     * We must provide the 'backing' option if the BDS has a backing
     * file or if the image file has a backing file name as part of
     * its metadata. Otherwise the 'backing' option can be omitted.
     */
    if (drv->supports_backing && reopen_state->backing_missing &&
        (reopen_state->bs->backing || reopen_state->bs->backing_file[0])) {
        error_setg(errp, "backing is missing for '%s'",
                   reopen_state->bs->node_name);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Allow changing the 'backing' option. The new value can be
     * either a reference to an existing node (using its node name)
     * or NULL to simply detach the current backing file.
     */
    ret = bdrv_reopen_parse_file_or_backing(reopen_state, true,
                                            change_child_tran, errp);
    if (ret < 0) {
        goto error;
    }
    qdict_del(reopen_state->options, "backing");

    /* Allow changing the 'file' option. In this case NULL is not allowed */
    ret = bdrv_reopen_parse_file_or_backing(reopen_state, false,
                                            change_child_tran, errp);
    if (ret < 0) {
        goto error;
    }
    qdict_del(reopen_state->options, "file");

    /* Options that are not handled are only okay if they are unchanged
     * compared to the old state. It is expected that some options are only
     * used for the initial open, but not reopen (e.g. filename) */
    if (qdict_size(reopen_state->options)) {
        const QDictEntry *entry = qdict_first(reopen_state->options);

        do {
            QObject *new = entry->value;
            QObject *old = qdict_get(reopen_state->bs->options, entry->key);

            /* Allow child references (child_name=node_name) as long as they
             * point to the current child (i.e. everything stays the same). */
            if (qobject_type(new) == QTYPE_QSTRING) {
                BdrvChild *child;
                QLIST_FOREACH(child, &reopen_state->bs->children, next) {
                    if (!strcmp(child->name, entry->key)) {
                        break;
                    }
                }

                if (child) {
                    if (!strcmp(child->bs->node_name,
                                qstring_get_str(qobject_to(QString, new)))) {
                        /*
                         * 'continue' in a do-while jumps to the loop
                         * condition, which advances to the next entry.
                         */
                        continue; /* Found child with this name, skip option */
                    }
                }
            }

            /*
             * TODO: When using -drive to specify blockdev options, all values
             * will be strings; however, when using -blockdev, blockdev-add or
             * filenames using the json:{} pseudo-protocol, they will be
             * correctly typed.
             * In contrast, reopening options are (currently) always strings
             * (because you can only specify them through qemu-io; all other
             * callers do not specify any options).
             * Therefore, when using anything other than -drive to create a BDS,
             * this cannot detect non-string options as unchanged, because
             * qobject_is_equal() always returns false for objects of different
             * type.  In the future, this should be remedied by correctly typing
             * all options.  For now, this is not too big of an issue because
             * the user can simply omit options which cannot be changed anyway,
             * so they will stay unchanged.
             */
            if (!qobject_is_equal(new, old)) {
                error_setg(errp, "Cannot change the option '%s'", entry->key);
                ret = -EINVAL;
                goto error;
            }
        } while ((entry = qdict_next(reopen_state->options, entry)));
    }

    ret = 0;

    /* Restore the original reopen_state->options QDict */
    qobject_unref(reopen_state->options);
    reopen_state->options = qobject_ref(orig_reopen_opts);

    /* The success path falls through: the cleanup below is shared */
error:
    if (ret < 0 && drv_prepared) {
        /* drv->bdrv_reopen_prepare() has succeeded, so we need to
         * call drv->bdrv_reopen_abort() before signaling an error
         * (bdrv_reopen_multiple() will not call bdrv_reopen_abort()
         * when the respective bdrv_reopen_prepare() has failed) */
        if (drv->bdrv_reopen_abort) {
            drv->bdrv_reopen_abort(reopen_state);
        }
    }
    qemu_opts_del(opts);
    qobject_unref(orig_reopen_opts);
    g_free(discard);
    return ret;
}
   4616
   4617/*
   4618 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
   4619 * makes them final by swapping the staging BlockDriverState contents into
   4620 * the active BlockDriverState contents.
   4621 */
   4622static void bdrv_reopen_commit(BDRVReopenState *reopen_state)
   4623{
   4624    BlockDriver *drv;
   4625    BlockDriverState *bs;
   4626    BdrvChild *child;
   4627
   4628    assert(reopen_state != NULL);
   4629    bs = reopen_state->bs;
   4630    drv = bs->drv;
   4631    assert(drv != NULL);
   4632
   4633    /* If there are any driver level actions to take */
   4634    if (drv->bdrv_reopen_commit) {
   4635        drv->bdrv_reopen_commit(reopen_state);
   4636    }
   4637
   4638    /* set BDS specific flags now */
   4639    qobject_unref(bs->explicit_options);
   4640    qobject_unref(bs->options);
   4641    qobject_ref(reopen_state->explicit_options);
   4642    qobject_ref(reopen_state->options);
   4643
   4644    bs->explicit_options   = reopen_state->explicit_options;
   4645    bs->options            = reopen_state->options;
   4646    bs->open_flags         = reopen_state->flags;
   4647    bs->detect_zeroes      = reopen_state->detect_zeroes;
   4648
   4649    /* Remove child references from bs->options and bs->explicit_options.
   4650     * Child options were already removed in bdrv_reopen_queue_child() */
   4651    QLIST_FOREACH(child, &bs->children, next) {
   4652        qdict_del(bs->explicit_options, child->name);
   4653        qdict_del(bs->options, child->name);
   4654    }
   4655    /* backing is probably removed, so it's not handled by previous loop */
   4656    qdict_del(bs->explicit_options, "backing");
   4657    qdict_del(bs->options, "backing");
   4658
   4659    bdrv_refresh_limits(bs, NULL, NULL);
   4660}
   4661
   4662/*
   4663 * Abort the reopen, and delete and free the staged changes in
   4664 * reopen_state
   4665 */
   4666static void bdrv_reopen_abort(BDRVReopenState *reopen_state)
   4667{
   4668    BlockDriver *drv;
   4669
   4670    assert(reopen_state != NULL);
   4671    drv = reopen_state->bs->drv;
   4672    assert(drv != NULL);
   4673
   4674    if (drv->bdrv_reopen_abort) {
   4675        drv->bdrv_reopen_abort(reopen_state);
   4676    }
   4677}
   4678
   4679
/*
 * Tear down the node @bs, which must not be referenced any more (asserted).
 *
 * Inside a drained section the node is flushed, the driver's .bdrv_close()
 * is called (if any), all children are unreferenced and the per-node state
 * (options, opaque data, block status cache, dirty bitmaps, AIO notifiers)
 * is released or reset.
 */
static void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;
    BdrvChild *child, *next;

    assert(!bs->refcnt);

    bdrv_drained_begin(bs); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain(bs); /* in case flush left pending I/O */

    if (bs->drv) {
        if (bs->drv->bdrv_close) {
            /* Must unfreeze all children, so bdrv_unref_child() works */
            bs->drv->bdrv_close(bs);
        }
        bs->drv = NULL;
    }

    /* _SAFE variant: the current child is unreferenced while iterating */
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_unref_child(bs, child);
    }

    /* Reset the node's fields to an empty state */
    bs->backing = NULL;
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    qatomic_set(&bs->copy_on_read, 0);
    bs->backing_file[0] = '\0';
    bs->backing_format[0] = '\0';
    bs->total_sectors = 0;
    bs->encrypted = false;
    bs->sg = false;
    qobject_unref(bs->options);
    qobject_unref(bs->explicit_options);
    bs->options = NULL;
    bs->explicit_options = NULL;
    qobject_unref(bs->full_open_options);
    bs->full_open_options = NULL;
    g_free(bs->block_status_cache);
    bs->block_status_cache = NULL;

    bdrv_release_named_dirty_bitmaps(bs);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    /* Free all AIO notifiers and leave an empty list behind */
    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
    bdrv_drained_end(bs);

    /*
     * If we're still inside some bdrv_drain_all_begin()/end() sections, end
     * them now since this BDS won't exist anymore when bdrv_drain_all_end()
     * gets called.
     */
    if (bs->quiesce_counter) {
        bdrv_drain_all_end_quiesce(bs);
    }
}
   4740
/*
 * Close all block nodes.
 *
 * No job may exist when this is called (asserted on entry), and the list of
 * all BlockDriverStates must be empty afterwards (asserted on exit).
 */
void bdrv_close_all(void)
{
    assert(job_next(NULL) == NULL);

    /* Drop references from requests still in flight, such as canceled block
     * jobs whose AIO context has not been polled yet */
    bdrv_drain_all();

    blk_remove_all_bs();
    blockdev_close_all_bdrv_states();

    assert(QTAILQ_EMPTY(&all_bdrv_states));
}
   4754
   4755static bool should_update_child(BdrvChild *c, BlockDriverState *to)
   4756{
   4757    GQueue *queue;
   4758    GHashTable *found;
   4759    bool ret;
   4760
   4761    if (c->klass->stay_at_node) {
   4762        return false;
   4763    }
   4764
   4765    /* If the child @c belongs to the BDS @to, replacing the current
   4766     * c->bs by @to would mean to create a loop.
   4767     *
   4768     * Such a case occurs when appending a BDS to a backing chain.
   4769     * For instance, imagine the following chain:
   4770     *
   4771     *   guest device -> node A -> further backing chain...
   4772     *
   4773     * Now we create a new BDS B which we want to put on top of this
   4774     * chain, so we first attach A as its backing node:
   4775     *
   4776     *                   node B
   4777     *                     |
   4778     *                     v
   4779     *   guest device -> node A -> further backing chain...
   4780     *
   4781     * Finally we want to replace A by B.  When doing that, we want to
   4782     * replace all pointers to A by pointers to B -- except for the
   4783     * pointer from B because (1) that would create a loop, and (2)
   4784     * that pointer should simply stay intact:
   4785     *
   4786     *   guest device -> node B
   4787     *                     |
   4788     *                     v
   4789     *                   node A -> further backing chain...
   4790     *
   4791     * In general, when replacing a node A (c->bs) by a node B (@to),
   4792     * if A is a child of B, that means we cannot replace A by B there
   4793     * because that would create a loop.  Silently detaching A from B
   4794     * is also not really an option.  So overall just leaving A in
   4795     * place there is the most sensible choice.
   4796     *
   4797     * We would also create a loop in any cases where @c is only
   4798     * indirectly referenced by @to. Prevent this by returning false
   4799     * if @c is found (by breadth-first search) anywhere in the whole
   4800     * subtree of @to.
   4801     */
   4802
   4803    ret = true;
   4804    found = g_hash_table_new(NULL, NULL);
   4805    g_hash_table_add(found, to);
   4806    queue = g_queue_new();
   4807    g_queue_push_tail(queue, to);
   4808
   4809    while (!g_queue_is_empty(queue)) {
   4810        BlockDriverState *v = g_queue_pop_head(queue);
   4811        BdrvChild *c2;
   4812
   4813        QLIST_FOREACH(c2, &v->children, next) {
   4814            if (c2 == c) {
   4815                ret = false;
   4816                break;
   4817            }
   4818
   4819            if (g_hash_table_contains(found, c2->bs)) {
   4820                continue;
   4821            }
   4822
   4823            g_queue_push_tail(queue, c2->bs);
   4824            g_hash_table_add(found, c2->bs);
   4825        }
   4826    }
   4827
   4828    g_queue_free(queue);
   4829    g_hash_table_destroy(found);
   4830
   4831    return ret;
   4832}
   4833
/*
 * State of one child removal staged by bdrv_remove_file_or_backing_child();
 * it is handed to the transaction callbacks as their opaque pointer.
 */
typedef struct BdrvRemoveFilterOrCowChild {
    /* The child link that was taken out of its parent */
    BdrvChild *child;
    /* true if the parent's ->backing pointed to @child, false for ->file */
    bool is_backing;
} BdrvRemoveFilterOrCowChild;
   4838
   4839static void bdrv_remove_filter_or_cow_child_abort(void *opaque)
   4840{
   4841    BdrvRemoveFilterOrCowChild *s = opaque;
   4842    BlockDriverState *parent_bs = s->child->opaque;
   4843
   4844    QLIST_INSERT_HEAD(&parent_bs->children, s->child, next);
   4845    if (s->is_backing) {
   4846        parent_bs->backing = s->child;
   4847    } else {
   4848        parent_bs->file = s->child;
   4849    }
   4850
   4851    /*
   4852     * We don't have to restore child->bs here to undo bdrv_replace_child_tran()
   4853     * because that function is transactionable and it registered own completion
   4854     * entries in @tran, so .abort() for bdrv_replace_child_safe() will be
   4855     * called automatically.
   4856     */
   4857}
   4858
   4859static void bdrv_remove_filter_or_cow_child_commit(void *opaque)
   4860{
   4861    BdrvRemoveFilterOrCowChild *s = opaque;
   4862
   4863    bdrv_child_free(s->child);
   4864}
   4865
   4866static TransactionActionDrv bdrv_remove_filter_or_cow_child_drv = {
   4867    .abort = bdrv_remove_filter_or_cow_child_abort,
   4868    .commit = bdrv_remove_filter_or_cow_child_commit,
   4869    .clean = g_free,
   4870};
   4871
   4872/*
   4873 * A function to remove backing or file child of @bs.
   4874 * Function doesn't update permissions, caller is responsible for this.
   4875 */
   4876static void bdrv_remove_file_or_backing_child(BlockDriverState *bs,
   4877                                              BdrvChild *child,
   4878                                              Transaction *tran)
   4879{
   4880    BdrvRemoveFilterOrCowChild *s;
   4881
   4882    assert(child == bs->backing || child == bs->file);
   4883
   4884    if (!child) {
   4885        return;
   4886    }
   4887
   4888    if (child->bs) {
   4889        bdrv_replace_child_tran(child, NULL, tran);
   4890    }
   4891
   4892    s = g_new(BdrvRemoveFilterOrCowChild, 1);
   4893    *s = (BdrvRemoveFilterOrCowChild) {
   4894        .child = child,
   4895        .is_backing = (child == bs->backing),
   4896    };
   4897    tran_add(tran, &bdrv_remove_filter_or_cow_child_drv, s);
   4898
   4899    QLIST_SAFE_REMOVE(child, next);
   4900    if (s->is_backing) {
   4901        bs->backing = NULL;
   4902    } else {
   4903        bs->file = NULL;
   4904    }
   4905}
   4906
   4907/*
   4908 * A function to remove backing-chain child of @bs if exists: cow child for
   4909 * format nodes (always .backing) and filter child for filters (may be .file or
   4910 * .backing)
   4911 */
   4912static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
   4913                                            Transaction *tran)
   4914{
   4915    bdrv_remove_file_or_backing_child(bs, bdrv_filter_or_cow_child(bs), tran);
   4916}
   4917
   4918static int bdrv_replace_node_noperm(BlockDriverState *from,
   4919                                    BlockDriverState *to,
   4920                                    bool auto_skip, Transaction *tran,
   4921                                    Error **errp)
   4922{
   4923    BdrvChild *c, *next;
   4924
   4925    QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
   4926        assert(c->bs == from);
   4927        if (!should_update_child(c, to)) {
   4928            if (auto_skip) {
   4929                continue;
   4930            }
   4931            error_setg(errp, "Should not change '%s' link to '%s'",
   4932                       c->name, from->node_name);
   4933            return -EINVAL;
   4934        }
   4935        if (c->frozen) {
   4936            error_setg(errp, "Cannot change '%s' link to '%s'",
   4937                       c->name, from->node_name);
   4938            return -EPERM;
   4939        }
   4940        bdrv_replace_child_tran(c, to, tran);
   4941    }
   4942
   4943    return 0;
   4944}
   4945
/*
 * Replace the node @from by the node @to in the parents of @from and update
 * the permissions of the resulting graph, all within one transaction.
 *
 * With auto_skip=true bdrv_replace_node_common skips updating parents of
 * @from if that would create a parent-child relation loop or if the parent
 * is a block-job.
 *
 * With auto_skip=false an error is returned if @from has a parent which
 * should not be updated.
 *
 * With @detach_subchain=true @to must be in a backing chain of @from. In this
 * case the backing link of the cow-parent of @to is removed.
 *
 * Returns 0 on success and a negative value on failure; the transaction is
 * finalized with that same value before returning.
 */
static int bdrv_replace_node_common(BlockDriverState *from,
                                    BlockDriverState *to,
                                    bool auto_skip, bool detach_subchain,
                                    Error **errp)
{
    Transaction *tran = tran_new();
    /* Both stay NULL if we fail before they are created */
    g_autoptr(GHashTable) found = NULL;
    g_autoptr(GSList) refresh_list = NULL;
    BlockDriverState *to_cow_parent = NULL;
    int ret;

    if (detach_subchain) {
        assert(bdrv_chain_contains(from, to));
        assert(from != to);
        /* Walk down from @from to the node whose filter/cow child is @to */
        for (to_cow_parent = from;
             bdrv_filter_or_cow_bs(to_cow_parent) != to;
             to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent))
        {
            ;
        }
    }

    /* Make sure that @from doesn't go away until we have successfully attached
     * all of its parents to @to. */
    bdrv_ref(from);

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
    bdrv_drained_begin(from);

    /*
     * Do the replacement without permission update.
     * Replacement may influence the permissions, we should calculate new
     * permissions based on new graph. If we fail, we'll roll-back the
     * replacement.
     */
    ret = bdrv_replace_node_noperm(from, to, auto_skip, tran, errp);
    if (ret < 0) {
        goto out;
    }

    if (detach_subchain) {
        bdrv_remove_filter_or_cow_child(to_cow_parent, tran);
    }

    found = g_hash_table_new(NULL, NULL);

    /* Collect the nodes reachable from @to and @from for the refresh */
    refresh_list = bdrv_topological_dfs(refresh_list, found, to);
    refresh_list = bdrv_topological_dfs(refresh_list, found, from);

    ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
    if (ret < 0) {
        goto out;
    }

    ret = 0;

out:
    /* Reached on success and on failure alike */
    tran_finalize(tran, ret);

    bdrv_drained_end(from);
    bdrv_unref(from);

    return ret;
}
   5021
   5022int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
   5023                      Error **errp)
   5024{
   5025    return bdrv_replace_node_common(from, to, true, false, errp);
   5026}
   5027
   5028int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
   5029{
   5030    return bdrv_replace_node_common(bs, bdrv_filter_or_cow_bs(bs), true, true,
   5031                                    errp);
   5032}
   5033
   5034/*
   5035 * Add new bs contents at the top of an image chain while the chain is
   5036 * live, while keeping required fields on the top layer.
   5037 *
   5038 * This will modify the BlockDriverState fields, and swap contents
   5039 * between bs_new and bs_top. Both bs_new and bs_top are modified.
   5040 *
   5041 * bs_new must not be attached to a BlockBackend and must not have backing
   5042 * child.
   5043 *
   5044 * This function does not create any image files.
   5045 */
   5046int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
   5047                Error **errp)
   5048{
   5049    int ret;
   5050    Transaction *tran = tran_new();
   5051
   5052    assert(!bs_new->backing);
   5053
   5054    ret = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
   5055                                   &child_of_bds, bdrv_backing_role(bs_new),
   5056                                   &bs_new->backing, tran, errp);
   5057    if (ret < 0) {
   5058        goto out;
   5059    }
   5060
   5061    ret = bdrv_replace_node_noperm(bs_top, bs_new, true, tran, errp);
   5062    if (ret < 0) {
   5063        goto out;
   5064    }
   5065
   5066    ret = bdrv_refresh_perms(bs_new, errp);
   5067out:
   5068    tran_finalize(tran, ret);
   5069
   5070    bdrv_refresh_limits(bs_top, NULL, NULL);
   5071
   5072    return ret;
   5073}
   5074
   5075/* Not for empty child */
   5076int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
   5077                          Error **errp)
   5078{
   5079    int ret;
   5080    Transaction *tran = tran_new();
   5081    g_autoptr(GHashTable) found = NULL;
   5082    g_autoptr(GSList) refresh_list = NULL;
   5083    BlockDriverState *old_bs = child->bs;
   5084
   5085    bdrv_ref(old_bs);
   5086    bdrv_drained_begin(old_bs);
   5087    bdrv_drained_begin(new_bs);
   5088
   5089    bdrv_replace_child_tran(child, new_bs, tran);
   5090
   5091    found = g_hash_table_new(NULL, NULL);
   5092    refresh_list = bdrv_topological_dfs(refresh_list, found, old_bs);
   5093    refresh_list = bdrv_topological_dfs(refresh_list, found, new_bs);
   5094
   5095    ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
   5096
   5097    tran_finalize(tran, ret);
   5098
   5099    bdrv_drained_end(old_bs);
   5100    bdrv_drained_end(new_bs);
   5101    bdrv_unref(old_bs);
   5102
   5103    return ret;
   5104}
   5105
   5106static void bdrv_delete(BlockDriverState *bs)
   5107{
   5108    assert(bdrv_op_blocker_is_empty(bs));
   5109    assert(!bs->refcnt);
   5110
   5111    /* remove from list, if necessary */
   5112    if (bs->node_name[0] != '\0') {
   5113        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
   5114    }
   5115    QTAILQ_REMOVE(&all_bdrv_states, bs, bs_list);
   5116
   5117    bdrv_close(bs);
   5118
   5119    g_free(bs);
   5120}
   5121
   5122
   5123/*
   5124 * Replace @bs by newly created block node.
   5125 *
   5126 * @options is a QDict of options to pass to the block drivers, or NULL for an
   5127 * empty set of options. The reference to the QDict belongs to the block layer
   5128 * after the call (even on failure), so if the caller intends to reuse the
   5129 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
   5130 */
   5131BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
   5132                                   int flags, Error **errp)
   5133{
   5134    ERRP_GUARD();
   5135    int ret;
   5136    BlockDriverState *new_node_bs = NULL;
   5137    const char *drvname, *node_name;
   5138    BlockDriver *drv;
   5139
   5140    drvname = qdict_get_try_str(options, "driver");
   5141    if (!drvname) {
   5142        error_setg(errp, "driver is not specified");
   5143        goto fail;
   5144    }
   5145
   5146    drv = bdrv_find_format(drvname);
   5147    if (!drv) {
   5148        error_setg(errp, "Unknown driver: '%s'", drvname);
   5149        goto fail;
   5150    }
   5151
   5152    node_name = qdict_get_try_str(options, "node-name");
   5153
   5154    new_node_bs = bdrv_new_open_driver_opts(drv, node_name, options, flags,
   5155                                            errp);
   5156    options = NULL; /* bdrv_new_open_driver() eats options */
   5157    if (!new_node_bs) {
   5158        error_prepend(errp, "Could not create node: ");
   5159        goto fail;
   5160    }
   5161
   5162    bdrv_drained_begin(bs);
   5163    ret = bdrv_replace_node(bs, new_node_bs, errp);
   5164    bdrv_drained_end(bs);
   5165
   5166    if (ret < 0) {
   5167        error_prepend(errp, "Could not replace node: ");
   5168        goto fail;
   5169    }
   5170
   5171    return new_node_bs;
   5172
   5173fail:
   5174    qobject_unref(options);
   5175    bdrv_unref(new_node_bs);
   5176    return NULL;
   5177}
   5178
   5179/*
   5180 * Run consistency checks on an image
   5181 *
   5182 * Returns 0 if the check could be completed (it doesn't mean that the image is
   5183 * free of errors) or -errno when an internal error occurred. The results of the
   5184 * check are stored in res.
   5185 */
   5186int coroutine_fn bdrv_co_check(BlockDriverState *bs,
   5187                               BdrvCheckResult *res, BdrvCheckMode fix)
   5188{
   5189    if (bs->drv == NULL) {
   5190        return -ENOMEDIUM;
   5191    }
   5192    if (bs->drv->bdrv_co_check == NULL) {
   5193        return -ENOTSUP;
   5194    }
   5195
   5196    memset(res, 0, sizeof(*res));
   5197    return bs->drv->bdrv_co_check(bs, res, fix);
   5198}
   5199
   5200/*
   5201 * Return values:
   5202 * 0        - success
   5203 * -EINVAL  - backing format specified, but no file
   5204 * -ENOSPC  - can't update the backing file because no space is left in the
   5205 *            image file header
   5206 * -ENOTSUP - format driver doesn't support changing the backing file
   5207 */
   5208int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
   5209                             const char *backing_fmt, bool require)
   5210{
   5211    BlockDriver *drv = bs->drv;
   5212    int ret;
   5213
   5214    if (!drv) {
   5215        return -ENOMEDIUM;
   5216    }
   5217
   5218    /* Backing file format doesn't make sense without a backing file */
   5219    if (backing_fmt && !backing_file) {
   5220        return -EINVAL;
   5221    }
   5222
   5223    if (require && backing_file && !backing_fmt) {
   5224        return -EINVAL;
   5225    }
   5226
   5227    if (drv->bdrv_change_backing_file != NULL) {
   5228        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
   5229    } else {
   5230        ret = -ENOTSUP;
   5231    }
   5232
   5233    if (ret == 0) {
   5234        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
   5235        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
   5236        pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
   5237                backing_file ?: "");
   5238    }
   5239    return ret;
   5240}
   5241
   5242/*
   5243 * Finds the first non-filter node above bs in the chain between
   5244 * active and bs.  The returned node is either an immediate parent of
   5245 * bs, or there are only filter nodes between the two.
   5246 *
   5247 * Returns NULL if bs is not found in active's image chain,
   5248 * or if active == bs.
   5249 *
   5250 * Returns the bottommost base image if bs == NULL.
   5251 */
   5252BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
   5253                                    BlockDriverState *bs)
   5254{
   5255    bs = bdrv_skip_filters(bs);
   5256    active = bdrv_skip_filters(active);
   5257
   5258    while (active) {
   5259        BlockDriverState *next = bdrv_backing_chain_next(active);
   5260        if (bs == next) {
   5261            return active;
   5262        }
   5263        active = next;
   5264    }
   5265
   5266    return NULL;
   5267}
   5268
   5269/* Given a BDS, searches for the base layer. */
   5270BlockDriverState *bdrv_find_base(BlockDriverState *bs)
   5271{
   5272    return bdrv_find_overlay(bs, NULL);
   5273}
   5274
   5275/*
   5276 * Return true if at least one of the COW (backing) and filter links
   5277 * between @bs and @base is frozen. @errp is set if that's the case.
   5278 * @base must be reachable from @bs, or NULL.
   5279 */
   5280bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
   5281                                  Error **errp)
   5282{
   5283    BlockDriverState *i;
   5284    BdrvChild *child;
   5285
   5286    for (i = bs; i != base; i = child_bs(child)) {
   5287        child = bdrv_filter_or_cow_child(i);
   5288
   5289        if (child && child->frozen) {
   5290            error_setg(errp, "Cannot change '%s' link from '%s' to '%s'",
   5291                       child->name, i->node_name, child->bs->node_name);
   5292            return true;
   5293        }
   5294    }
   5295
   5296    return false;
   5297}
   5298
   5299/*
   5300 * Freeze all COW (backing) and filter links between @bs and @base.
   5301 * If any of the links is already frozen the operation is aborted and
   5302 * none of the links are modified.
   5303 * @base must be reachable from @bs, or NULL.
   5304 * Returns 0 on success. On failure returns < 0 and sets @errp.
   5305 */
   5306int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
   5307                              Error **errp)
   5308{
   5309    BlockDriverState *i;
   5310    BdrvChild *child;
   5311
   5312    if (bdrv_is_backing_chain_frozen(bs, base, errp)) {
   5313        return -EPERM;
   5314    }
   5315
   5316    for (i = bs; i != base; i = child_bs(child)) {
   5317        child = bdrv_filter_or_cow_child(i);
   5318        if (child && child->bs->never_freeze) {
   5319            error_setg(errp, "Cannot freeze '%s' link to '%s'",
   5320                       child->name, child->bs->node_name);
   5321            return -EPERM;
   5322        }
   5323    }
   5324
   5325    for (i = bs; i != base; i = child_bs(child)) {
   5326        child = bdrv_filter_or_cow_child(i);
   5327        if (child) {
   5328            child->frozen = true;
   5329        }
   5330    }
   5331
   5332    return 0;
   5333}
   5334
   5335/*
   5336 * Unfreeze all COW (backing) and filter links between @bs and @base.
   5337 * The caller must ensure that all links are frozen before using this
   5338 * function.
   5339 * @base must be reachable from @bs, or NULL.
   5340 */
   5341void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base)
   5342{
   5343    BlockDriverState *i;
   5344    BdrvChild *child;
   5345
   5346    for (i = bs; i != base; i = child_bs(child)) {
   5347        child = bdrv_filter_or_cow_child(i);
   5348        if (child) {
   5349            assert(child->frozen);
   5350            child->frozen = false;
   5351        }
   5352    }
   5353}
   5354
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata. Otherwise the refreshed filename of 'base' is used.
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 * Returns 0 on success. Returns -EIO if 'top' or 'base' has no driver, if
 * 'base' is not in the chain of 'top', or if replacing the node fails.
 * Returns the negative value of a parent's update_filename() callback if
 * that callback fails. Errors are reported with error_report_err(); there
 * is no errp parameter.
 */
int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
                           const char *backing_file_str)
{
    BlockDriverState *explicit_top = top;
    bool update_inherits_from;
    BdrvChild *c;
    Error *local_err = NULL;
    int ret = -EIO;
    g_autoptr(GSList) updated_children = NULL;
    GSList *p;

    /* Keep 'top' alive and its subtree drained until the exit label */
    bdrv_ref(top);
    bdrv_subtree_drained_begin(top);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    /* Make sure that base is in the backing chain of top */
    if (!bdrv_chain_contains(top, base)) {
        goto exit;
    }

    /* If 'base' recursively inherits from 'top' then we should set
     * base->inherits_from to top->inherits_from after 'top' and all
     * other intermediate nodes have been dropped.
     * If 'top' is an implicit node (e.g. "commit_top") we should skip
     * it because no one inherits from it. We use explicit_top for that. */
    explicit_top = bdrv_skip_implicit_filters(explicit_top);
    update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top);

    /* success - we can delete the intermediate states, and link top->base */
    /* TODO Check graph modification op blockers (BLK_PERM_GRAPH_MOD) once
     * we've figured out how they should work. */
    if (!backing_file_str) {
        bdrv_refresh_filename(base);
        backing_file_str = base->filename;
    }

    /*
     * Remember the parents of 'top' before the replacement, so that their
     * update_filename() callbacks can be invoked afterwards.
     */
    QLIST_FOREACH(c, &top->parents, next_parent) {
        updated_children = g_slist_prepend(updated_children, c);
    }

    /*
     * It seems correct to pass detach_subchain=true here, but it triggers
     * one more yet not fixed bug, when due to nested aio_poll loop we switch to
     * another drained section, which modify the graph (for example, removing
     * the child, which we keep in updated_children list). So, it's a TODO.
     *
     * Note, bug triggered if pass detach_subchain=true here and run
     * test-bdrv-drain. test_drop_intermediate_poll() test-case will crash.
     * That's a FIXME.
     */
    /* Failure is detected through local_err; the return value is unused */
    bdrv_replace_node_common(top, base, false, false, &local_err);
    if (local_err) {
        error_report_err(local_err);
        goto exit;
    }

    for (p = updated_children; p; p = p->next) {
        c = p->data;

        if (c->klass->update_filename) {
            ret = c->klass->update_filename(c, base, backing_file_str,
                                            &local_err);
            if (ret < 0) {
                /*
                 * TODO: Actually, we want to rollback all previous iterations
                 * of this loop, and (which is almost impossible) previous
                 * bdrv_replace_node()...
                 *
                 * Note, that c->klass->update_filename may lead to permission
                 * update, so it's a bad idea to call it inside permission
                 * update transaction of bdrv_replace_node.
                 */
                error_report_err(local_err);
                goto exit;
            }
        }
    }

    if (update_inherits_from) {
        base->inherits_from = explicit_top->inherits_from;
    }

    ret = 0;
exit:
    bdrv_subtree_drained_end(top);
    bdrv_unref(top);
    return ret;
}
   5475
   5476/**
   5477 * Implementation of BlockDriver.bdrv_get_allocated_file_size() that
   5478 * sums the size of all data-bearing children.  (This excludes backing
   5479 * children.)
   5480 */
   5481static int64_t bdrv_sum_allocated_file_size(BlockDriverState *bs)
   5482{
   5483    BdrvChild *child;
   5484    int64_t child_size, sum = 0;
   5485
   5486    QLIST_FOREACH(child, &bs->children, next) {
   5487        if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
   5488                           BDRV_CHILD_FILTERED))
   5489        {
   5490            child_size = bdrv_get_allocated_file_size(child->bs);
   5491            if (child_size < 0) {
   5492                return child_size;
   5493            }
   5494            sum += child_size;
   5495        }
   5496    }
   5497
   5498    return sum;
   5499}
   5500
   5501/**
   5502 * Length of a allocated file in bytes. Sparse files are counted by actual
   5503 * allocated space. Return < 0 if error or unknown.
   5504 */
   5505int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
   5506{
   5507    BlockDriver *drv = bs->drv;
   5508    if (!drv) {
   5509        return -ENOMEDIUM;
   5510    }
   5511    if (drv->bdrv_get_allocated_file_size) {
   5512        return drv->bdrv_get_allocated_file_size(bs);
   5513    }
   5514
   5515    if (drv->bdrv_file_open) {
   5516        /*
   5517         * Protocol drivers default to -ENOTSUP (most of their data is
   5518         * not stored in any of their children (if they even have any),
   5519         * so there is no generic way to figure it out).
   5520         */
   5521        return -ENOTSUP;
   5522    } else if (drv->is_filter) {
   5523        /* Filter drivers default to the size of their filtered child */
   5524        return bdrv_get_allocated_file_size(bdrv_filter_bs(bs));
   5525    } else {
   5526        /* Other drivers default to summing their children's sizes */
   5527        return bdrv_sum_allocated_file_size(bs);
   5528    }
   5529}
   5530
   5531/*
   5532 * bdrv_measure:
   5533 * @drv: Format driver
   5534 * @opts: Creation options for new image
   5535 * @in_bs: Existing image containing data for new image (may be NULL)
   5536 * @errp: Error object
   5537 * Returns: A #BlockMeasureInfo (free using qapi_free_BlockMeasureInfo())
   5538 *          or NULL on error
   5539 *
   5540 * Calculate file size required to create a new image.
   5541 *
   5542 * If @in_bs is given then space for allocated clusters and zero clusters
   5543 * from that image are included in the calculation.  If @opts contains a
   5544 * backing file that is shared by @in_bs then backing clusters may be omitted
   5545 * from the calculation.
   5546 *
   5547 * If @in_bs is NULL then the calculation includes no allocated clusters
   5548 * unless a preallocation option is given in @opts.
   5549 *
   5550 * Note that @in_bs may use a different BlockDriver from @drv.
   5551 *
   5552 * If an error occurs the @errp pointer is set.
   5553 */
   5554BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
   5555                               BlockDriverState *in_bs, Error **errp)
   5556{
   5557    if (!drv->bdrv_measure) {
   5558        error_setg(errp, "Block driver '%s' does not support size measurement",
   5559                   drv->format_name);
   5560        return NULL;
   5561    }
   5562
   5563    return drv->bdrv_measure(opts, in_bs, errp);
   5564}
   5565
   5566/**
   5567 * Return number of sectors on success, -errno on error.
   5568 */
   5569int64_t bdrv_nb_sectors(BlockDriverState *bs)
   5570{
   5571    BlockDriver *drv = bs->drv;
   5572
   5573    if (!drv)
   5574        return -ENOMEDIUM;
   5575
   5576    if (drv->has_variable_length) {
   5577        int ret = refresh_total_sectors(bs, bs->total_sectors);
   5578        if (ret < 0) {
   5579            return ret;
   5580        }
   5581    }
   5582    return bs->total_sectors;
   5583}
   5584
   5585/**
   5586 * Return length in bytes on success, -errno on error.
   5587 * The length is always a multiple of BDRV_SECTOR_SIZE.
   5588 */
   5589int64_t bdrv_getlength(BlockDriverState *bs)
   5590{
   5591    int64_t ret = bdrv_nb_sectors(bs);
   5592
   5593    if (ret < 0) {
   5594        return ret;
   5595    }
   5596    if (ret > INT64_MAX / BDRV_SECTOR_SIZE) {
   5597        return -EFBIG;
   5598    }
   5599    return ret * BDRV_SECTOR_SIZE;
   5600}
   5601
   5602/* return 0 as number of sectors if no device present or error */
   5603void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
   5604{
   5605    int64_t nb_sectors = bdrv_nb_sectors(bs);
   5606
   5607    *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
   5608}
   5609
   5610bool bdrv_is_sg(BlockDriverState *bs)
   5611{
   5612    return bs->sg;
   5613}
   5614
   5615/**
   5616 * Return whether the given node supports compressed writes.
   5617 */
   5618bool bdrv_supports_compressed_writes(BlockDriverState *bs)
   5619{
   5620    BlockDriverState *filtered;
   5621
   5622    if (!bs->drv || !block_driver_can_compress(bs->drv)) {
   5623        return false;
   5624    }
   5625
   5626    filtered = bdrv_filter_bs(bs);
   5627    if (filtered) {
   5628        /*
   5629         * Filters can only forward compressed writes, so we have to
   5630         * check the child.
   5631         */
   5632        return bdrv_supports_compressed_writes(filtered);
   5633    }
   5634
   5635    return true;
   5636}
   5637
   5638const char *bdrv_get_format_name(BlockDriverState *bs)
   5639{
   5640    return bs->drv ? bs->drv->format_name : NULL;
   5641}
   5642
   5643static int qsort_strcmp(const void *a, const void *b)
   5644{
   5645    return strcmp(*(char *const *)a, *(char *const *)b);
   5646}
   5647
   5648void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
   5649                         void *opaque, bool read_only)
   5650{
   5651    BlockDriver *drv;
   5652    int count = 0;
   5653    int i;
   5654    const char **formats = NULL;
   5655
   5656    QLIST_FOREACH(drv, &bdrv_drivers, list) {
   5657        if (drv->format_name) {
   5658            bool found = false;
   5659            int i = count;
   5660
   5661            if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, read_only)) {
   5662                continue;
   5663            }
   5664
   5665            while (formats && i && !found) {
   5666                found = !strcmp(formats[--i], drv->format_name);
   5667            }
   5668
   5669            if (!found) {
   5670                formats = g_renew(const char *, formats, count + 1);
   5671                formats[count++] = drv->format_name;
   5672            }
   5673        }
   5674    }
   5675
   5676    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); i++) {
   5677        const char *format_name = block_driver_modules[i].format_name;
   5678
   5679        if (format_name) {
   5680            bool found = false;
   5681            int j = count;
   5682
   5683            if (use_bdrv_whitelist &&
   5684                !bdrv_format_is_whitelisted(format_name, read_only)) {
   5685                continue;
   5686            }
   5687
   5688            while (formats && j && !found) {
   5689                found = !strcmp(formats[--j], format_name);
   5690            }
   5691
   5692            if (!found) {
   5693                formats = g_renew(const char *, formats, count + 1);
   5694                formats[count++] = format_name;
   5695            }
   5696        }
   5697    }
   5698
   5699    qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
   5700
   5701    for (i = 0; i < count; i++) {
   5702        it(opaque, formats[i]);
   5703    }
   5704
   5705    g_free(formats);
   5706}
   5707
   5708/* This function is to find a node in the bs graph */
   5709BlockDriverState *bdrv_find_node(const char *node_name)
   5710{
   5711    BlockDriverState *bs;
   5712
   5713    assert(node_name);
   5714
   5715    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
   5716        if (!strcmp(node_name, bs->node_name)) {
   5717            return bs;
   5718        }
   5719    }
   5720    return NULL;
   5721}
   5722
   5723/* Put this QMP function here so it can access the static graph_bdrv_states. */
   5724BlockDeviceInfoList *bdrv_named_nodes_list(bool flat,
   5725                                           Error **errp)
   5726{
   5727    BlockDeviceInfoList *list;
   5728    BlockDriverState *bs;
   5729
   5730    list = NULL;
   5731    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
   5732        BlockDeviceInfo *info = bdrv_block_device_info(NULL, bs, flat, errp);
   5733        if (!info) {
   5734            qapi_free_BlockDeviceInfoList(list);
   5735            return NULL;
   5736        }
   5737        QAPI_LIST_PREPEND(list, info);
   5738    }
   5739
   5740    return list;
   5741}
   5742
/*
 * Working state used while an XDbgBlockGraph is assembled; created by
 * xdbg_graph_new() and released by xdbg_graph_finalize().
 */
typedef struct XDbgBlockGraphConstructor {
    /* The graph under construction; handed out by xdbg_graph_finalize() */
    XDbgBlockGraph *graph;
    /* Maps an object pointer to the numeric id assigned to it (from 1) */
    GHashTable *graph_nodes;
} XDbgBlockGraphConstructor;
   5747
   5748static XDbgBlockGraphConstructor *xdbg_graph_new(void)
   5749{
   5750    XDbgBlockGraphConstructor *gr = g_new(XDbgBlockGraphConstructor, 1);
   5751
   5752    gr->graph = g_new0(XDbgBlockGraph, 1);
   5753    gr->graph_nodes = g_hash_table_new(NULL, NULL);
   5754
   5755    return gr;
   5756}
   5757
   5758static XDbgBlockGraph *xdbg_graph_finalize(XDbgBlockGraphConstructor *gr)
   5759{
   5760    XDbgBlockGraph *graph = gr->graph;
   5761
   5762    g_hash_table_destroy(gr->graph_nodes);
   5763    g_free(gr);
   5764
   5765    return graph;
   5766}
   5767
   5768static uintptr_t xdbg_graph_node_num(XDbgBlockGraphConstructor *gr, void *node)
   5769{
   5770    uintptr_t ret = (uintptr_t)g_hash_table_lookup(gr->graph_nodes, node);
   5771
   5772    if (ret != 0) {
   5773        return ret;
   5774    }
   5775
   5776    /*
   5777     * Start counting from 1, not 0, because 0 interferes with not-found (NULL)
   5778     * answer of g_hash_table_lookup.
   5779     */
   5780    ret = g_hash_table_size(gr->graph_nodes) + 1;
   5781    g_hash_table_insert(gr->graph_nodes, node, (void *)ret);
   5782
   5783    return ret;
   5784}
   5785
   5786static void xdbg_graph_add_node(XDbgBlockGraphConstructor *gr, void *node,
   5787                                XDbgBlockGraphNodeType type, const char *name)
   5788{
   5789    XDbgBlockGraphNode *n;
   5790
   5791    n = g_new0(XDbgBlockGraphNode, 1);
   5792
   5793    n->id = xdbg_graph_node_num(gr, node);
   5794    n->type = type;
   5795    n->name = g_strdup(name);
   5796
   5797    QAPI_LIST_PREPEND(gr->graph->nodes, n);
   5798}
   5799
   5800static void xdbg_graph_add_edge(XDbgBlockGraphConstructor *gr, void *parent,
   5801                                const BdrvChild *child)
   5802{
   5803    BlockPermission qapi_perm;
   5804    XDbgBlockGraphEdge *edge;
   5805
   5806    edge = g_new0(XDbgBlockGraphEdge, 1);
   5807
   5808    edge->parent = xdbg_graph_node_num(gr, parent);
   5809    edge->child = xdbg_graph_node_num(gr, child->bs);
   5810    edge->name = g_strdup(child->name);
   5811
   5812    for (qapi_perm = 0; qapi_perm < BLOCK_PERMISSION__MAX; qapi_perm++) {
   5813        uint64_t flag = bdrv_qapi_perm_to_blk_perm(qapi_perm);
   5814
   5815        if (flag & child->perm) {
   5816            QAPI_LIST_PREPEND(edge->perm, qapi_perm);
   5817        }
   5818        if (flag & child->shared_perm) {
   5819            QAPI_LIST_PREPEND(edge->shared_perm, qapi_perm);
   5820        }
   5821    }
   5822
   5823    QAPI_LIST_PREPEND(gr->graph->edges, edge);
   5824}
   5825
   5826
   5827XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp)
   5828{
   5829    BlockBackend *blk;
   5830    BlockJob *job;
   5831    BlockDriverState *bs;
   5832    BdrvChild *child;
   5833    XDbgBlockGraphConstructor *gr = xdbg_graph_new();
   5834
   5835    for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) {
   5836        char *allocated_name = NULL;
   5837        const char *name = blk_name(blk);
   5838
   5839        if (!*name) {
   5840            name = allocated_name = blk_get_attached_dev_id(blk);
   5841        }
   5842        xdbg_graph_add_node(gr, blk, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_BACKEND,
   5843                           name);
   5844        g_free(allocated_name);
   5845        if (blk_root(blk)) {
   5846            xdbg_graph_add_edge(gr, blk, blk_root(blk));
   5847        }
   5848    }
   5849
   5850    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
   5851        GSList *el;
   5852
   5853        xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
   5854                           job->job.id);
   5855        for (el = job->nodes; el; el = el->next) {
   5856            xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
   5857        }
   5858    }
   5859
   5860    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
   5861        xdbg_graph_add_node(gr, bs, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_DRIVER,
   5862                           bs->node_name);
   5863        QLIST_FOREACH(child, &bs->children, next) {
   5864            xdbg_graph_add_edge(gr, bs, child);
   5865        }
   5866    }
   5867
   5868    return xdbg_graph_finalize(gr);
   5869}
   5870
   5871BlockDriverState *bdrv_lookup_bs(const char *device,
   5872                                 const char *node_name,
   5873                                 Error **errp)
   5874{
   5875    BlockBackend *blk;
   5876    BlockDriverState *bs;
   5877
   5878    if (device) {
   5879        blk = blk_by_name(device);
   5880
   5881        if (blk) {
   5882            bs = blk_bs(blk);
   5883            if (!bs) {
   5884                error_setg(errp, "Device '%s' has no medium", device);
   5885            }
   5886
   5887            return bs;
   5888        }
   5889    }
   5890
   5891    if (node_name) {
   5892        bs = bdrv_find_node(node_name);
   5893
   5894        if (bs) {
   5895            return bs;
   5896        }
   5897    }
   5898
   5899    error_setg(errp, "Cannot find device=\'%s\' nor node-name=\'%s\'",
   5900                     device ? device : "",
   5901                     node_name ? node_name : "");
   5902    return NULL;
   5903}
   5904
   5905/* If 'base' is in the same chain as 'top', return true. Otherwise,
   5906 * return false.  If either argument is NULL, return false. */
   5907bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
   5908{
   5909    while (top && top != base) {
   5910        top = bdrv_filter_or_cow_bs(top);
   5911    }
   5912
   5913    return top != NULL;
   5914}
   5915
   5916BlockDriverState *bdrv_next_node(BlockDriverState *bs)
   5917{
   5918    if (!bs) {
   5919        return QTAILQ_FIRST(&graph_bdrv_states);
   5920    }
   5921    return QTAILQ_NEXT(bs, node_list);
   5922}
   5923
   5924BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
   5925{
   5926    if (!bs) {
   5927        return QTAILQ_FIRST(&all_bdrv_states);
   5928    }
   5929    return QTAILQ_NEXT(bs, bs_list);
   5930}
   5931
   5932const char *bdrv_get_node_name(const BlockDriverState *bs)
   5933{
   5934    return bs->node_name;
   5935}
   5936
   5937const char *bdrv_get_parent_name(const BlockDriverState *bs)
   5938{
   5939    BdrvChild *c;
   5940    const char *name;
   5941
   5942    /* If multiple parents have a name, just pick the first one. */
   5943    QLIST_FOREACH(c, &bs->parents, next_parent) {
   5944        if (c->klass->get_name) {
   5945            name = c->klass->get_name(c);
   5946            if (name && *name) {
   5947                return name;
   5948            }
   5949        }
   5950    }
   5951
   5952    return NULL;
   5953}
   5954
   5955/* TODO check what callers really want: bs->node_name or blk_name() */
   5956const char *bdrv_get_device_name(const BlockDriverState *bs)
   5957{
   5958    return bdrv_get_parent_name(bs) ?: "";
   5959}
   5960
   5961/* This can be used to identify nodes that might not have a device
   5962 * name associated. Since node and device names live in the same
   5963 * namespace, the result is unambiguous. The exception is if both are
   5964 * absent, then this returns an empty (non-null) string. */
   5965const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
   5966{
   5967    return bdrv_get_parent_name(bs) ?: bs->node_name;
   5968}
   5969
   5970int bdrv_get_flags(BlockDriverState *bs)
   5971{
   5972    return bs->open_flags;
   5973}
   5974
   5975int bdrv_has_zero_init_1(BlockDriverState *bs)
   5976{
   5977    return 1;
   5978}
   5979
   5980int bdrv_has_zero_init(BlockDriverState *bs)
   5981{
   5982    BlockDriverState *filtered;
   5983
   5984    if (!bs->drv) {
   5985        return 0;
   5986    }
   5987
   5988    /* If BS is a copy on write image, it is initialized to
   5989       the contents of the base image, which may not be zeroes.  */
   5990    if (bdrv_cow_child(bs)) {
   5991        return 0;
   5992    }
   5993    if (bs->drv->bdrv_has_zero_init) {
   5994        return bs->drv->bdrv_has_zero_init(bs);
   5995    }
   5996
   5997    filtered = bdrv_filter_bs(bs);
   5998    if (filtered) {
   5999        return bdrv_has_zero_init(filtered);
   6000    }
   6001
   6002    /* safe default */
   6003    return 0;
   6004}
   6005
   6006bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
   6007{
   6008    if (!(bs->open_flags & BDRV_O_UNMAP)) {
   6009        return false;
   6010    }
   6011
   6012    return bs->supported_zero_flags & BDRV_REQ_MAY_UNMAP;
   6013}
   6014
   6015void bdrv_get_backing_filename(BlockDriverState *bs,
   6016                               char *filename, int filename_size)
   6017{
   6018    pstrcpy(filename, filename_size, bs->backing_file);
   6019}
   6020
   6021int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
   6022{
   6023    int ret;
   6024    BlockDriver *drv = bs->drv;
   6025    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
   6026    if (!drv) {
   6027        return -ENOMEDIUM;
   6028    }
   6029    if (!drv->bdrv_get_info) {
   6030        BlockDriverState *filtered = bdrv_filter_bs(bs);
   6031        if (filtered) {
   6032            return bdrv_get_info(filtered, bdi);
   6033        }
   6034        return -ENOTSUP;
   6035    }
   6036    memset(bdi, 0, sizeof(*bdi));
   6037    ret = drv->bdrv_get_info(bs, bdi);
   6038    if (ret < 0) {
   6039        return ret;
   6040    }
   6041
   6042    if (bdi->cluster_size > BDRV_MAX_ALIGNMENT) {
   6043        return -EINVAL;
   6044    }
   6045
   6046    return 0;
   6047}
   6048
   6049ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
   6050                                          Error **errp)
   6051{
   6052    BlockDriver *drv = bs->drv;
   6053    if (drv && drv->bdrv_get_specific_info) {
   6054        return drv->bdrv_get_specific_info(bs, errp);
   6055    }
   6056    return NULL;
   6057}
   6058
   6059BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
   6060{
   6061    BlockDriver *drv = bs->drv;
   6062    if (!drv || !drv->bdrv_get_specific_stats) {
   6063        return NULL;
   6064    }
   6065    return drv->bdrv_get_specific_stats(bs);
   6066}
   6067
   6068void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event)
   6069{
   6070    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
   6071        return;
   6072    }
   6073
   6074    bs->drv->bdrv_debug_event(bs, event);
   6075}
   6076
   6077static BlockDriverState *bdrv_find_debug_node(BlockDriverState *bs)
   6078{
   6079    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
   6080        bs = bdrv_primary_bs(bs);
   6081    }
   6082
   6083    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
   6084        assert(bs->drv->bdrv_debug_remove_breakpoint);
   6085        return bs;
   6086    }
   6087
   6088    return NULL;
   6089}
   6090
   6091int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
   6092                          const char *tag)
   6093{
   6094    bs = bdrv_find_debug_node(bs);
   6095    if (bs) {
   6096        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
   6097    }
   6098
   6099    return -ENOTSUP;
   6100}
   6101
   6102int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
   6103{
   6104    bs = bdrv_find_debug_node(bs);
   6105    if (bs) {
   6106        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
   6107    }
   6108
   6109    return -ENOTSUP;
   6110}
   6111
   6112int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
   6113{
   6114    while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
   6115        bs = bdrv_primary_bs(bs);
   6116    }
   6117
   6118    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
   6119        return bs->drv->bdrv_debug_resume(bs, tag);
   6120    }
   6121
   6122    return -ENOTSUP;
   6123}
   6124
   6125bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
   6126{
   6127    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
   6128        bs = bdrv_primary_bs(bs);
   6129    }
   6130
   6131    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
   6132        return bs->drv->bdrv_debug_is_suspended(bs, tag);
   6133    }
   6134
   6135    return false;
   6136}
   6137
   6138/* backing_file can either be relative, or absolute, or a protocol.  If it is
   6139 * relative, it must be relative to the chain.  So, passing in bs->filename
   6140 * from a BDS as backing_file should not be done, as that may be relative to
   6141 * the CWD rather than the chain. */
   6142BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
   6143        const char *backing_file)
   6144{
   6145    char *filename_full = NULL;
   6146    char *backing_file_full = NULL;
   6147    char *filename_tmp = NULL;
   6148    int is_protocol = 0;
   6149    bool filenames_refreshed = false;
   6150    BlockDriverState *curr_bs = NULL;
   6151    BlockDriverState *retval = NULL;
   6152    BlockDriverState *bs_below;
   6153
   6154    if (!bs || !bs->drv || !backing_file) {
   6155        return NULL;
   6156    }
   6157
   6158    filename_full     = g_malloc(PATH_MAX);
   6159    backing_file_full = g_malloc(PATH_MAX);
   6160
   6161    is_protocol = path_has_protocol(backing_file);
   6162
   6163    /*
   6164     * Being largely a legacy function, skip any filters here
   6165     * (because filters do not have normal filenames, so they cannot
   6166     * match anyway; and allowing json:{} filenames is a bit out of
   6167     * scope).
   6168     */
   6169    for (curr_bs = bdrv_skip_filters(bs);
   6170         bdrv_cow_child(curr_bs) != NULL;
   6171         curr_bs = bs_below)
   6172    {
   6173        bs_below = bdrv_backing_chain_next(curr_bs);
   6174
   6175        if (bdrv_backing_overridden(curr_bs)) {
   6176            /*
   6177             * If the backing file was overridden, we can only compare
   6178             * directly against the backing node's filename.
   6179             */
   6180
   6181            if (!filenames_refreshed) {
   6182                /*
   6183                 * This will automatically refresh all of the
   6184                 * filenames in the rest of the backing chain, so we
   6185                 * only need to do this once.
   6186                 */
   6187                bdrv_refresh_filename(bs_below);
   6188                filenames_refreshed = true;
   6189            }
   6190
   6191            if (strcmp(backing_file, bs_below->filename) == 0) {
   6192                retval = bs_below;
   6193                break;
   6194            }
   6195        } else if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
   6196            /*
   6197             * If either of the filename paths is actually a protocol, then
   6198             * compare unmodified paths; otherwise make paths relative.
   6199             */
   6200            char *backing_file_full_ret;
   6201
   6202            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
   6203                retval = bs_below;
   6204                break;
   6205            }
   6206            /* Also check against the full backing filename for the image */
   6207            backing_file_full_ret = bdrv_get_full_backing_filename(curr_bs,
   6208                                                                   NULL);
   6209            if (backing_file_full_ret) {
   6210                bool equal = strcmp(backing_file, backing_file_full_ret) == 0;
   6211                g_free(backing_file_full_ret);
   6212                if (equal) {
   6213                    retval = bs_below;
   6214                    break;
   6215                }
   6216            }
   6217        } else {
   6218            /* If not an absolute filename path, make it relative to the current
   6219             * image's filename path */
   6220            filename_tmp = bdrv_make_absolute_filename(curr_bs, backing_file,
   6221                                                       NULL);
   6222            /* We are going to compare canonicalized absolute pathnames */
   6223            if (!filename_tmp || !realpath(filename_tmp, filename_full)) {
   6224                g_free(filename_tmp);
   6225                continue;
   6226            }
   6227            g_free(filename_tmp);
   6228
   6229            /* We need to make sure the backing filename we are comparing against
   6230             * is relative to the current image filename (or absolute) */
   6231            filename_tmp = bdrv_get_full_backing_filename(curr_bs, NULL);
   6232            if (!filename_tmp || !realpath(filename_tmp, backing_file_full)) {
   6233                g_free(filename_tmp);
   6234                continue;
   6235            }
   6236            g_free(filename_tmp);
   6237
   6238            if (strcmp(backing_file_full, filename_full) == 0) {
   6239                retval = bs_below;
   6240                break;
   6241            }
   6242        }
   6243    }
   6244
   6245    g_free(filename_full);
   6246    g_free(backing_file_full);
   6247    return retval;
   6248}
   6249
   6250void bdrv_init(void)
   6251{
   6252#ifdef CONFIG_BDRV_WHITELIST_TOOLS
   6253    use_bdrv_whitelist = 1;
   6254#endif
   6255    module_call_init(MODULE_INIT_BLOCK);
   6256}
   6257
   6258void bdrv_init_with_whitelist(void)
   6259{
   6260    use_bdrv_whitelist = 1;
   6261    bdrv_init();
   6262}
   6263
   6264int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
   6265{
   6266    BdrvChild *child, *parent;
   6267    Error *local_err = NULL;
   6268    int ret;
   6269    BdrvDirtyBitmap *bm;
   6270
   6271    if (!bs->drv)  {
   6272        return -ENOMEDIUM;
   6273    }
   6274
   6275    QLIST_FOREACH(child, &bs->children, next) {
   6276        bdrv_co_invalidate_cache(child->bs, &local_err);
   6277        if (local_err) {
   6278            error_propagate(errp, local_err);
   6279            return -EINVAL;
   6280        }
   6281    }
   6282
   6283    /*
   6284     * Update permissions, they may differ for inactive nodes.
   6285     *
   6286     * Note that the required permissions of inactive images are always a
   6287     * subset of the permissions required after activating the image. This
   6288     * allows us to just get the permissions upfront without restricting
   6289     * drv->bdrv_invalidate_cache().
   6290     *
   6291     * It also means that in error cases, we don't have to try and revert to
   6292     * the old permissions (which is an operation that could fail, too). We can
   6293     * just keep the extended permissions for the next time that an activation
   6294     * of the image is tried.
   6295     */
   6296    if (bs->open_flags & BDRV_O_INACTIVE) {
   6297        bs->open_flags &= ~BDRV_O_INACTIVE;
   6298        ret = bdrv_refresh_perms(bs, errp);
   6299        if (ret < 0) {
   6300            bs->open_flags |= BDRV_O_INACTIVE;
   6301            return ret;
   6302        }
   6303
   6304        if (bs->drv->bdrv_co_invalidate_cache) {
   6305            bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
   6306            if (local_err) {
   6307                bs->open_flags |= BDRV_O_INACTIVE;
   6308                error_propagate(errp, local_err);
   6309                return -EINVAL;
   6310            }
   6311        }
   6312
   6313        FOR_EACH_DIRTY_BITMAP(bs, bm) {
   6314            bdrv_dirty_bitmap_skip_store(bm, false);
   6315        }
   6316
   6317        ret = refresh_total_sectors(bs, bs->total_sectors);
   6318        if (ret < 0) {
   6319            bs->open_flags |= BDRV_O_INACTIVE;
   6320            error_setg_errno(errp, -ret, "Could not refresh total sector count");
   6321            return ret;
   6322        }
   6323    }
   6324
   6325    QLIST_FOREACH(parent, &bs->parents, next_parent) {
   6326        if (parent->klass->activate) {
   6327            parent->klass->activate(parent, &local_err);
   6328            if (local_err) {
   6329                bs->open_flags |= BDRV_O_INACTIVE;
   6330                error_propagate(errp, local_err);
   6331                return -EINVAL;
   6332            }
   6333        }
   6334    }
   6335
   6336    return 0;
   6337}
   6338
   6339void bdrv_invalidate_cache_all(Error **errp)
   6340{
   6341    BlockDriverState *bs;
   6342    BdrvNextIterator it;
   6343
   6344    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
   6345        AioContext *aio_context = bdrv_get_aio_context(bs);
   6346        int ret;
   6347
   6348        aio_context_acquire(aio_context);
   6349        ret = bdrv_invalidate_cache(bs, errp);
   6350        aio_context_release(aio_context);
   6351        if (ret < 0) {
   6352            bdrv_next_cleanup(&it);
   6353            return;
   6354        }
   6355    }
   6356}
   6357
   6358static bool bdrv_has_bds_parent(BlockDriverState *bs, bool only_active)
   6359{
   6360    BdrvChild *parent;
   6361
   6362    QLIST_FOREACH(parent, &bs->parents, next_parent) {
   6363        if (parent->klass->parent_is_bds) {
   6364            BlockDriverState *parent_bs = parent->opaque;
   6365            if (!only_active || !(parent_bs->open_flags & BDRV_O_INACTIVE)) {
   6366                return true;
   6367            }
   6368        }
   6369    }
   6370
   6371    return false;
   6372}
   6373
   6374static int bdrv_inactivate_recurse(BlockDriverState *bs)
   6375{
   6376    BdrvChild *child, *parent;
   6377    int ret;
   6378    uint64_t cumulative_perms, cumulative_shared_perms;
   6379
   6380    if (!bs->drv) {
   6381        return -ENOMEDIUM;
   6382    }
   6383
   6384    /* Make sure that we don't inactivate a child before its parent.
   6385     * It will be covered by recursion from the yet active parent. */
   6386    if (bdrv_has_bds_parent(bs, true)) {
   6387        return 0;
   6388    }
   6389
   6390    assert(!(bs->open_flags & BDRV_O_INACTIVE));
   6391
   6392    /* Inactivate this node */
   6393    if (bs->drv->bdrv_inactivate) {
   6394        ret = bs->drv->bdrv_inactivate(bs);
   6395        if (ret < 0) {
   6396            return ret;
   6397        }
   6398    }
   6399
   6400    QLIST_FOREACH(parent, &bs->parents, next_parent) {
   6401        if (parent->klass->inactivate) {
   6402            ret = parent->klass->inactivate(parent);
   6403            if (ret < 0) {
   6404                return ret;
   6405            }
   6406        }
   6407    }
   6408
   6409    bdrv_get_cumulative_perm(bs, &cumulative_perms,
   6410                             &cumulative_shared_perms);
   6411    if (cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
   6412        /* Our inactive parents still need write access. Inactivation failed. */
   6413        return -EPERM;
   6414    }
   6415
   6416    bs->open_flags |= BDRV_O_INACTIVE;
   6417
   6418    /*
   6419     * Update permissions, they may differ for inactive nodes.
   6420     * We only tried to loosen restrictions, so errors are not fatal, ignore
   6421     * them.
   6422     */
   6423    bdrv_refresh_perms(bs, NULL);
   6424
   6425    /* Recursively inactivate children */
   6426    QLIST_FOREACH(child, &bs->children, next) {
   6427        ret = bdrv_inactivate_recurse(child->bs);
   6428        if (ret < 0) {
   6429            return ret;
   6430        }
   6431    }
   6432
   6433    return 0;
   6434}
   6435
   6436int bdrv_inactivate_all(void)
   6437{
   6438    BlockDriverState *bs = NULL;
   6439    BdrvNextIterator it;
   6440    int ret = 0;
   6441    GSList *aio_ctxs = NULL, *ctx;
   6442
   6443    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
   6444        AioContext *aio_context = bdrv_get_aio_context(bs);
   6445
   6446        if (!g_slist_find(aio_ctxs, aio_context)) {
   6447            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
   6448            aio_context_acquire(aio_context);
   6449        }
   6450    }
   6451
   6452    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
   6453        /* Nodes with BDS parents are covered by recursion from the last
   6454         * parent that gets inactivated. Don't inactivate them a second
   6455         * time if that has already happened. */
   6456        if (bdrv_has_bds_parent(bs, false)) {
   6457            continue;
   6458        }
   6459        ret = bdrv_inactivate_recurse(bs);
   6460        if (ret < 0) {
   6461            bdrv_next_cleanup(&it);
   6462            goto out;
   6463        }
   6464    }
   6465
   6466out:
   6467    for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
   6468        AioContext *aio_context = ctx->data;
   6469        aio_context_release(aio_context);
   6470    }
   6471    g_slist_free(aio_ctxs);
   6472
   6473    return ret;
   6474}
   6475
   6476/**************************************************************/
   6477/* removable device support */
   6478
   6479/**
   6480 * Return TRUE if the media is present
   6481 */
   6482bool bdrv_is_inserted(BlockDriverState *bs)
   6483{
   6484    BlockDriver *drv = bs->drv;
   6485    BdrvChild *child;
   6486
   6487    if (!drv) {
   6488        return false;
   6489    }
   6490    if (drv->bdrv_is_inserted) {
   6491        return drv->bdrv_is_inserted(bs);
   6492    }
   6493    QLIST_FOREACH(child, &bs->children, next) {
   6494        if (!bdrv_is_inserted(child->bs)) {
   6495            return false;
   6496        }
   6497    }
   6498    return true;
   6499}
   6500
   6501/**
   6502 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
   6503 */
   6504void bdrv_eject(BlockDriverState *bs, bool eject_flag)
   6505{
   6506    BlockDriver *drv = bs->drv;
   6507
   6508    if (drv && drv->bdrv_eject) {
   6509        drv->bdrv_eject(bs, eject_flag);
   6510    }
   6511}
   6512
   6513/**
   6514 * Lock or unlock the media (if it is locked, the user won't be able
   6515 * to eject it manually).
   6516 */
   6517void bdrv_lock_medium(BlockDriverState *bs, bool locked)
   6518{
   6519    BlockDriver *drv = bs->drv;
   6520
   6521    trace_bdrv_lock_medium(bs, locked);
   6522
   6523    if (drv && drv->bdrv_lock_medium) {
   6524        drv->bdrv_lock_medium(bs, locked);
   6525    }
   6526}
   6527
   6528/* Get a reference to bs */
   6529void bdrv_ref(BlockDriverState *bs)
   6530{
   6531    bs->refcnt++;
   6532}
   6533
   6534/* Release a previously grabbed reference to bs.
   6535 * If after releasing, reference count is zero, the BlockDriverState is
   6536 * deleted. */
   6537void bdrv_unref(BlockDriverState *bs)
   6538{
   6539    if (!bs) {
   6540        return;
   6541    }
   6542    assert(bs->refcnt > 0);
   6543    if (--bs->refcnt == 0) {
   6544        bdrv_delete(bs);
   6545    }
   6546}
   6547
   6548struct BdrvOpBlocker {
   6549    Error *reason;
   6550    QLIST_ENTRY(BdrvOpBlocker) list;
   6551};
   6552
   6553bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
   6554{
   6555    BdrvOpBlocker *blocker;
   6556    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
   6557    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
   6558        blocker = QLIST_FIRST(&bs->op_blockers[op]);
   6559        error_propagate_prepend(errp, error_copy(blocker->reason),
   6560                                "Node '%s' is busy: ",
   6561                                bdrv_get_device_or_node_name(bs));
   6562        return true;
   6563    }
   6564    return false;
   6565}
   6566
   6567void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
   6568{
   6569    BdrvOpBlocker *blocker;
   6570    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
   6571
   6572    blocker = g_new0(BdrvOpBlocker, 1);
   6573    blocker->reason = reason;
   6574    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
   6575}
   6576
   6577void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
   6578{
   6579    BdrvOpBlocker *blocker, *next;
   6580    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
   6581    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
   6582        if (blocker->reason == reason) {
   6583            QLIST_REMOVE(blocker, list);
   6584            g_free(blocker);
   6585        }
   6586    }
   6587}
   6588
   6589void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
   6590{
   6591    int i;
   6592    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
   6593        bdrv_op_block(bs, i, reason);
   6594    }
   6595}
   6596
   6597void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
   6598{
   6599    int i;
   6600    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
   6601        bdrv_op_unblock(bs, i, reason);
   6602    }
   6603}
   6604
   6605bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
   6606{
   6607    int i;
   6608
   6609    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
   6610        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
   6611            return false;
   6612        }
   6613    }
   6614    return true;
   6615}
   6616
   6617void bdrv_img_create(const char *filename, const char *fmt,
   6618                     const char *base_filename, const char *base_fmt,
   6619                     char *options, uint64_t img_size, int flags, bool quiet,
   6620                     Error **errp)
   6621{
   6622    QemuOptsList *create_opts = NULL;
   6623    QemuOpts *opts = NULL;
   6624    const char *backing_fmt, *backing_file;
   6625    int64_t size;
   6626    BlockDriver *drv, *proto_drv;
   6627    Error *local_err = NULL;
   6628    int ret = 0;
   6629
   6630    /* Find driver and parse its options */
   6631    drv = bdrv_find_format(fmt);
   6632    if (!drv) {
   6633        error_setg(errp, "Unknown file format '%s'", fmt);
   6634        return;
   6635    }
   6636
   6637    proto_drv = bdrv_find_protocol(filename, true, errp);
   6638    if (!proto_drv) {
   6639        return;
   6640    }
   6641
   6642    if (!drv->create_opts) {
   6643        error_setg(errp, "Format driver '%s' does not support image creation",
   6644                   drv->format_name);
   6645        return;
   6646    }
   6647
   6648    if (!proto_drv->create_opts) {
   6649        error_setg(errp, "Protocol driver '%s' does not support image creation",
   6650                   proto_drv->format_name);
   6651        return;
   6652    }
   6653
   6654    /* Create parameter list */
   6655    create_opts = qemu_opts_append(create_opts, drv->create_opts);
   6656    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
   6657
   6658    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
   6659
   6660    /* Parse -o options */
   6661    if (options) {
   6662        if (!qemu_opts_do_parse(opts, options, NULL, errp)) {
   6663            goto out;
   6664        }
   6665    }
   6666
   6667    if (!qemu_opt_get(opts, BLOCK_OPT_SIZE)) {
   6668        qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
   6669    } else if (img_size != UINT64_C(-1)) {
   6670        error_setg(errp, "The image size must be specified only once");
   6671        goto out;
   6672    }
   6673
   6674    if (base_filename) {
   6675        if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename,
   6676                          NULL)) {
   6677            error_setg(errp, "Backing file not supported for file format '%s'",
   6678                       fmt);
   6679            goto out;
   6680        }
   6681    }
   6682
   6683    if (base_fmt) {
   6684        if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, NULL)) {
   6685            error_setg(errp, "Backing file format not supported for file "
   6686                             "format '%s'", fmt);
   6687            goto out;
   6688        }
   6689    }
   6690
   6691    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
   6692    if (backing_file) {
   6693        if (!strcmp(filename, backing_file)) {
   6694            error_setg(errp, "Error: Trying to create an image with the "
   6695                             "same filename as the backing file");
   6696            goto out;
   6697        }
   6698        if (backing_file[0] == '\0') {
   6699            error_setg(errp, "Expected backing file name, got empty string");
   6700            goto out;
   6701        }
   6702    }
   6703
   6704    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
   6705
   6706    /* The size for the image must always be specified, unless we have a backing
   6707     * file and we have not been forbidden from opening it. */
   6708    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, img_size);
   6709    if (backing_file && !(flags & BDRV_O_NO_BACKING)) {
   6710        BlockDriverState *bs;
   6711        char *full_backing;
   6712        int back_flags;
   6713        QDict *backing_options = NULL;
   6714
   6715        full_backing =
   6716            bdrv_get_full_backing_filename_from_filename(filename, backing_file,
   6717                                                         &local_err);
   6718        if (local_err) {
   6719            goto out;
   6720        }
   6721        assert(full_backing);
   6722
   6723        /*
   6724         * No need to do I/O here, which allows us to open encrypted
   6725         * backing images without needing the secret
   6726         */
   6727        back_flags = flags;
   6728        back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
   6729        back_flags |= BDRV_O_NO_IO;
   6730
   6731        backing_options = qdict_new();
   6732        if (backing_fmt) {
   6733            qdict_put_str(backing_options, "driver", backing_fmt);
   6734        }
   6735        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
   6736
   6737        bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
   6738                       &local_err);
   6739        g_free(full_backing);
   6740        if (!bs) {
   6741            error_append_hint(&local_err, "Could not open backing image.\n");
   6742            goto out;
   6743        } else {
   6744            if (!backing_fmt) {
   6745                error_setg(&local_err,
   6746                           "Backing file specified without backing format");
   6747                error_append_hint(&local_err, "Detected format of %s.",
   6748                                  bs->drv->format_name);
   6749                goto out;
   6750            }
   6751            if (size == -1) {
   6752                /* Opened BS, have no size */
   6753                size = bdrv_getlength(bs);
   6754                if (size < 0) {
   6755                    error_setg_errno(errp, -size, "Could not get size of '%s'",
   6756                                     backing_file);
   6757                    bdrv_unref(bs);
   6758                    goto out;
   6759                }
   6760                qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);
   6761            }
   6762            bdrv_unref(bs);
   6763        }
   6764        /* (backing_file && !(flags & BDRV_O_NO_BACKING)) */
   6765    } else if (backing_file && !backing_fmt) {
   6766        error_setg(&local_err,
   6767                   "Backing file specified without backing format");
   6768        goto out;
   6769    }
   6770
   6771    if (size == -1) {
   6772        error_setg(errp, "Image creation needs a size parameter");
   6773        goto out;
   6774    }
   6775
   6776    if (!quiet) {
   6777        printf("Formatting '%s', fmt=%s ", filename, fmt);
   6778        qemu_opts_print(opts, " ");
   6779        puts("");
   6780        fflush(stdout);
   6781    }
   6782
   6783    ret = bdrv_create(drv, filename, opts, &local_err);
   6784
   6785    if (ret == -EFBIG) {
   6786        /* This is generally a better message than whatever the driver would
   6787         * deliver (especially because of the cluster_size_hint), since that
   6788         * is most probably not much different from "image too large". */
   6789        const char *cluster_size_hint = "";
   6790        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
   6791            cluster_size_hint = " (try using a larger cluster size)";
   6792        }
   6793        error_setg(errp, "The image size is too large for file format '%s'"
   6794                   "%s", fmt, cluster_size_hint);
   6795        error_free(local_err);
   6796        local_err = NULL;
   6797    }
   6798
   6799out:
   6800    qemu_opts_del(opts);
   6801    qemu_opts_free(create_opts);
   6802    error_propagate(errp, local_err);
   6803}
   6804
   6805AioContext *bdrv_get_aio_context(BlockDriverState *bs)
   6806{
   6807    return bs ? bs->aio_context : qemu_get_aio_context();
   6808}
   6809
   6810AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs)
   6811{
   6812    Coroutine *self = qemu_coroutine_self();
   6813    AioContext *old_ctx = qemu_coroutine_get_aio_context(self);
   6814    AioContext *new_ctx;
   6815
   6816    /*
   6817     * Increase bs->in_flight to ensure that this operation is completed before
   6818     * moving the node to a different AioContext. Read new_ctx only afterwards.
   6819     */
   6820    bdrv_inc_in_flight(bs);
   6821
   6822    new_ctx = bdrv_get_aio_context(bs);
   6823    aio_co_reschedule_self(new_ctx);
   6824    return old_ctx;
   6825}
   6826
   6827void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx)
   6828{
   6829    aio_co_reschedule_self(old_ctx);
   6830    bdrv_dec_in_flight(bs);
   6831}
   6832
   6833void coroutine_fn bdrv_co_lock(BlockDriverState *bs)
   6834{
   6835    AioContext *ctx = bdrv_get_aio_context(bs);
   6836
   6837    /* In the main thread, bs->aio_context won't change concurrently */
   6838    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
   6839
   6840    /*
   6841     * We're in coroutine context, so we already hold the lock of the main
   6842     * loop AioContext. Don't lock it twice to avoid deadlocks.
   6843     */
   6844    assert(qemu_in_coroutine());
   6845    if (ctx != qemu_get_aio_context()) {
   6846        aio_context_acquire(ctx);
   6847    }
   6848}
   6849
   6850void coroutine_fn bdrv_co_unlock(BlockDriverState *bs)
   6851{
   6852    AioContext *ctx = bdrv_get_aio_context(bs);
   6853
   6854    assert(qemu_in_coroutine());
   6855    if (ctx != qemu_get_aio_context()) {
   6856        aio_context_release(ctx);
   6857    }
   6858}
   6859
   6860void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co)
   6861{
   6862    aio_co_enter(bdrv_get_aio_context(bs), co);
   6863}
   6864
   6865static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
   6866{
   6867    QLIST_REMOVE(ban, list);
   6868    g_free(ban);
   6869}
   6870
   6871static void bdrv_detach_aio_context(BlockDriverState *bs)
   6872{
   6873    BdrvAioNotifier *baf, *baf_tmp;
   6874
   6875    assert(!bs->walking_aio_notifiers);
   6876    bs->walking_aio_notifiers = true;
   6877    QLIST_FOREACH_SAFE(baf, &bs->aio_notifiers, list, baf_tmp) {
   6878        if (baf->deleted) {
   6879            bdrv_do_remove_aio_context_notifier(baf);
   6880        } else {
   6881            baf->detach_aio_context(baf->opaque);
   6882        }
   6883    }
   6884    /* Never mind iterating again to check for ->deleted.  bdrv_close() will
   6885     * remove remaining aio notifiers if we aren't called again.
   6886     */
   6887    bs->walking_aio_notifiers = false;
   6888
   6889    if (bs->drv && bs->drv->bdrv_detach_aio_context) {
   6890        bs->drv->bdrv_detach_aio_context(bs);
   6891    }
   6892
   6893    if (bs->quiesce_counter) {
   6894        aio_enable_external(bs->aio_context);
   6895    }
   6896    bs->aio_context = NULL;
   6897}
   6898
   6899static void bdrv_attach_aio_context(BlockDriverState *bs,
   6900                                    AioContext *new_context)
   6901{
   6902    BdrvAioNotifier *ban, *ban_tmp;
   6903
   6904    if (bs->quiesce_counter) {
   6905        aio_disable_external(new_context);
   6906    }
   6907
   6908    bs->aio_context = new_context;
   6909
   6910    if (bs->drv && bs->drv->bdrv_attach_aio_context) {
   6911        bs->drv->bdrv_attach_aio_context(bs, new_context);
   6912    }
   6913
   6914    assert(!bs->walking_aio_notifiers);
   6915    bs->walking_aio_notifiers = true;
   6916    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_tmp) {
   6917        if (ban->deleted) {
   6918            bdrv_do_remove_aio_context_notifier(ban);
   6919        } else {
   6920            ban->attached_aio_context(new_context, ban->opaque);
   6921        }
   6922    }
   6923    bs->walking_aio_notifiers = false;
   6924}
   6925
   6926/*
   6927 * Changes the AioContext used for fd handlers, timers, and BHs by this
   6928 * BlockDriverState and all its children and parents.
   6929 *
   6930 * Must be called from the main AioContext.
   6931 *
   6932 * The caller must own the AioContext lock for the old AioContext of bs, but it
   6933 * must not own the AioContext lock for new_context (unless new_context is the
   6934 * same as the current context of bs).
   6935 *
   6936 * @ignore will accumulate all visited BdrvChild object. The caller is
   6937 * responsible for freeing the list afterwards.
   6938 */
   6939void bdrv_set_aio_context_ignore(BlockDriverState *bs,
   6940                                 AioContext *new_context, GSList **ignore)
   6941{
   6942    AioContext *old_context = bdrv_get_aio_context(bs);
   6943    GSList *children_to_process = NULL;
   6944    GSList *parents_to_process = NULL;
   6945    GSList *entry;
   6946    BdrvChild *child, *parent;
   6947
   6948    g_assert(qemu_get_current_aio_context() == qemu_get_aio_context());
   6949
   6950    if (old_context == new_context) {
   6951        return;
   6952    }
   6953
   6954    bdrv_drained_begin(bs);
   6955
   6956    QLIST_FOREACH(child, &bs->children, next) {
   6957        if (g_slist_find(*ignore, child)) {
   6958            continue;
   6959        }
   6960        *ignore = g_slist_prepend(*ignore, child);
   6961        children_to_process = g_slist_prepend(children_to_process, child);
   6962    }
   6963
   6964    QLIST_FOREACH(parent, &bs->parents, next_parent) {
   6965        if (g_slist_find(*ignore, parent)) {
   6966            continue;
   6967        }
   6968        *ignore = g_slist_prepend(*ignore, parent);
   6969        parents_to_process = g_slist_prepend(parents_to_process, parent);
   6970    }
   6971
   6972    for (entry = children_to_process;
   6973         entry != NULL;
   6974         entry = g_slist_next(entry)) {
   6975        child = entry->data;
   6976        bdrv_set_aio_context_ignore(child->bs, new_context, ignore);
   6977    }
   6978    g_slist_free(children_to_process);
   6979
   6980    for (entry = parents_to_process;
   6981         entry != NULL;
   6982         entry = g_slist_next(entry)) {
   6983        parent = entry->data;
   6984        assert(parent->klass->set_aio_ctx);
   6985        parent->klass->set_aio_ctx(parent, new_context, ignore);
   6986    }
   6987    g_slist_free(parents_to_process);
   6988
   6989    bdrv_detach_aio_context(bs);
   6990
   6991    /* Acquire the new context, if necessary */
   6992    if (qemu_get_aio_context() != new_context) {
   6993        aio_context_acquire(new_context);
   6994    }
   6995
   6996    bdrv_attach_aio_context(bs, new_context);
   6997
   6998    /*
   6999     * If this function was recursively called from
   7000     * bdrv_set_aio_context_ignore(), there may be nodes in the
   7001     * subtree that have not yet been moved to the new AioContext.
   7002     * Release the old one so bdrv_drained_end() can poll them.
   7003     */
   7004    if (qemu_get_aio_context() != old_context) {
   7005        aio_context_release(old_context);
   7006    }
   7007
   7008    bdrv_drained_end(bs);
   7009
   7010    if (qemu_get_aio_context() != old_context) {
   7011        aio_context_acquire(old_context);
   7012    }
   7013    if (qemu_get_aio_context() != new_context) {
   7014        aio_context_release(new_context);
   7015    }
   7016}
   7017
   7018static bool bdrv_parent_can_set_aio_context(BdrvChild *c, AioContext *ctx,
   7019                                            GSList **ignore, Error **errp)
   7020{
   7021    if (g_slist_find(*ignore, c)) {
   7022        return true;
   7023    }
   7024    *ignore = g_slist_prepend(*ignore, c);
   7025
   7026    /*
   7027     * A BdrvChildClass that doesn't handle AioContext changes cannot
   7028     * tolerate any AioContext changes
   7029     */
   7030    if (!c->klass->can_set_aio_ctx) {
   7031        char *user = bdrv_child_user_desc(c);
   7032        error_setg(errp, "Changing iothreads is not supported by %s", user);
   7033        g_free(user);
   7034        return false;
   7035    }
   7036    if (!c->klass->can_set_aio_ctx(c, ctx, ignore, errp)) {
   7037        assert(!errp || *errp);
   7038        return false;
   7039    }
   7040    return true;
   7041}
   7042
   7043bool bdrv_child_can_set_aio_context(BdrvChild *c, AioContext *ctx,
   7044                                    GSList **ignore, Error **errp)
   7045{
   7046    if (g_slist_find(*ignore, c)) {
   7047        return true;
   7048    }
   7049    *ignore = g_slist_prepend(*ignore, c);
   7050    return bdrv_can_set_aio_context(c->bs, ctx, ignore, errp);
   7051}
   7052
   7053/* @ignore will accumulate all visited BdrvChild object. The caller is
   7054 * responsible for freeing the list afterwards. */
   7055bool bdrv_can_set_aio_context(BlockDriverState *bs, AioContext *ctx,
   7056                              GSList **ignore, Error **errp)
   7057{
   7058    BdrvChild *c;
   7059
   7060    if (bdrv_get_aio_context(bs) == ctx) {
   7061        return true;
   7062    }
   7063
   7064    QLIST_FOREACH(c, &bs->parents, next_parent) {
   7065        if (!bdrv_parent_can_set_aio_context(c, ctx, ignore, errp)) {
   7066            return false;
   7067        }
   7068    }
   7069    QLIST_FOREACH(c, &bs->children, next) {
   7070        if (!bdrv_child_can_set_aio_context(c, ctx, ignore, errp)) {
   7071            return false;
   7072        }
   7073    }
   7074
   7075    return true;
   7076}
   7077
   7078int bdrv_child_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,
   7079                                   BdrvChild *ignore_child, Error **errp)
   7080{
   7081    GSList *ignore;
   7082    bool ret;
   7083
   7084    ignore = ignore_child ? g_slist_prepend(NULL, ignore_child) : NULL;
   7085    ret = bdrv_can_set_aio_context(bs, ctx, &ignore, errp);
   7086    g_slist_free(ignore);
   7087
   7088    if (!ret) {
   7089        return -EPERM;
   7090    }
   7091
   7092    ignore = ignore_child ? g_slist_prepend(NULL, ignore_child) : NULL;
   7093    bdrv_set_aio_context_ignore(bs, ctx, &ignore);
   7094    g_slist_free(ignore);
   7095
   7096    return 0;
   7097}
   7098
   7099int bdrv_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,
   7100                             Error **errp)
   7101{
   7102    return bdrv_child_try_set_aio_context(bs, ctx, NULL, errp);
   7103}
   7104
   7105void bdrv_add_aio_context_notifier(BlockDriverState *bs,
   7106        void (*attached_aio_context)(AioContext *new_context, void *opaque),
   7107        void (*detach_aio_context)(void *opaque), void *opaque)
   7108{
   7109    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
   7110    *ban = (BdrvAioNotifier){
   7111        .attached_aio_context = attached_aio_context,
   7112        .detach_aio_context   = detach_aio_context,
   7113        .opaque               = opaque
   7114    };
   7115
   7116    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
   7117}
   7118
   7119void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
   7120                                      void (*attached_aio_context)(AioContext *,
   7121                                                                   void *),
   7122                                      void (*detach_aio_context)(void *),
   7123                                      void *opaque)
   7124{
   7125    BdrvAioNotifier *ban, *ban_next;
   7126
   7127    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
   7128        if (ban->attached_aio_context == attached_aio_context &&
   7129            ban->detach_aio_context   == detach_aio_context   &&
   7130            ban->opaque               == opaque               &&
   7131            ban->deleted              == false)
   7132        {
   7133            if (bs->walking_aio_notifiers) {
   7134                ban->deleted = true;
   7135            } else {
   7136                bdrv_do_remove_aio_context_notifier(ban);
   7137            }
   7138            return;
   7139        }
   7140    }
   7141
   7142    abort();
   7143}
   7144
   7145int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
   7146                       BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
   7147                       bool force,
   7148                       Error **errp)
   7149{
   7150    if (!bs->drv) {
   7151        error_setg(errp, "Node is ejected");
   7152        return -ENOMEDIUM;
   7153    }
   7154    if (!bs->drv->bdrv_amend_options) {
   7155        error_setg(errp, "Block driver '%s' does not support option amendment",
   7156                   bs->drv->format_name);
   7157        return -ENOTSUP;
   7158    }
   7159    return bs->drv->bdrv_amend_options(bs, opts, status_cb,
   7160                                       cb_opaque, force, errp);
   7161}
   7162
   7163/*
   7164 * This function checks whether the given @to_replace is allowed to be
   7165 * replaced by a node that always shows the same data as @bs.  This is
   7166 * used for example to verify whether the mirror job can replace
   7167 * @to_replace by the target mirrored from @bs.
   7168 * To be replaceable, @bs and @to_replace may either be guaranteed to
   7169 * always show the same data (because they are only connected through
   7170 * filters), or some driver may allow replacing one of its children
   7171 * because it can guarantee that this child's data is not visible at
   7172 * all (for example, for dissenting quorum children that have no other
   7173 * parents).
   7174 */
   7175bool bdrv_recurse_can_replace(BlockDriverState *bs,
   7176                              BlockDriverState *to_replace)
   7177{
   7178    BlockDriverState *filtered;
   7179
   7180    if (!bs || !bs->drv) {
   7181        return false;
   7182    }
   7183
   7184    if (bs == to_replace) {
   7185        return true;
   7186    }
   7187
   7188    /* See what the driver can do */
   7189    if (bs->drv->bdrv_recurse_can_replace) {
   7190        return bs->drv->bdrv_recurse_can_replace(bs, to_replace);
   7191    }
   7192
   7193    /* For filters without an own implementation, we can recurse on our own */
   7194    filtered = bdrv_filter_bs(bs);
   7195    if (filtered) {
   7196        return bdrv_recurse_can_replace(filtered, to_replace);
   7197    }
   7198
   7199    /* Safe default */
   7200    return false;
   7201}
   7202
   7203/*
   7204 * Check whether the given @node_name can be replaced by a node that
   7205 * has the same data as @parent_bs.  If so, return @node_name's BDS;
   7206 * NULL otherwise.
   7207 *
   7208 * @node_name must be a (recursive) *child of @parent_bs (or this
   7209 * function will return NULL).
   7210 *
   7211 * The result (whether the node can be replaced or not) is only valid
   7212 * for as long as no graph or permission changes occur.
   7213 */
   7214BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
   7215                                        const char *node_name, Error **errp)
   7216{
   7217    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
   7218    AioContext *aio_context;
   7219
   7220    if (!to_replace_bs) {
   7221        error_setg(errp, "Failed to find node with node-name='%s'", node_name);
   7222        return NULL;
   7223    }
   7224
   7225    aio_context = bdrv_get_aio_context(to_replace_bs);
   7226    aio_context_acquire(aio_context);
   7227
   7228    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
   7229        to_replace_bs = NULL;
   7230        goto out;
   7231    }
   7232
   7233    /* We don't want arbitrary node of the BDS chain to be replaced only the top
   7234     * most non filter in order to prevent data corruption.
   7235     * Another benefit is that this tests exclude backing files which are
   7236     * blocked by the backing blockers.
   7237     */
   7238    if (!bdrv_recurse_can_replace(parent_bs, to_replace_bs)) {
   7239        error_setg(errp, "Cannot replace '%s' by a node mirrored from '%s', "
   7240                   "because it cannot be guaranteed that doing so would not "
   7241                   "lead to an abrupt change of visible data",
   7242                   node_name, parent_bs->node_name);
   7243        to_replace_bs = NULL;
   7244        goto out;
   7245    }
   7246
   7247out:
   7248    aio_context_release(aio_context);
   7249    return to_replace_bs;
   7250}
   7251
   7252/**
   7253 * Iterates through the list of runtime option keys that are said to
   7254 * be "strong" for a BDS.  An option is called "strong" if it changes
   7255 * a BDS's data.  For example, the null block driver's "size" and
   7256 * "read-zeroes" options are strong, but its "latency-ns" option is
   7257 * not.
   7258 *
   7259 * If a key returned by this function ends with a dot, all options
   7260 * starting with that prefix are strong.
   7261 */
   7262static const char *const *strong_options(BlockDriverState *bs,
   7263                                         const char *const *curopt)
   7264{
   7265    static const char *const global_options[] = {
   7266        "driver", "filename", NULL
   7267    };
   7268
   7269    if (!curopt) {
   7270        return &global_options[0];
   7271    }
   7272
   7273    curopt++;
   7274    if (curopt == &global_options[ARRAY_SIZE(global_options) - 1] && bs->drv) {
   7275        curopt = bs->drv->strong_runtime_opts;
   7276    }
   7277
   7278    return (curopt && *curopt) ? curopt : NULL;
   7279}
   7280
   7281/**
   7282 * Copies all strong runtime options from bs->options to the given
   7283 * QDict.  The set of strong option keys is determined by invoking
   7284 * strong_options().
   7285 *
   7286 * Returns true iff any strong option was present in bs->options (and
   7287 * thus copied to the target QDict) with the exception of "filename"
   7288 * and "driver".  The caller is expected to use this value to decide
   7289 * whether the existence of strong options prevents the generation of
   7290 * a plain filename.
   7291 */
   7292static bool append_strong_runtime_options(QDict *d, BlockDriverState *bs)
   7293{
   7294    bool found_any = false;
   7295    const char *const *option_name = NULL;
   7296
   7297    if (!bs->drv) {
   7298        return false;
   7299    }
   7300
   7301    while ((option_name = strong_options(bs, option_name))) {
   7302        bool option_given = false;
   7303
   7304        assert(strlen(*option_name) > 0);
   7305        if ((*option_name)[strlen(*option_name) - 1] != '.') {
   7306            QObject *entry = qdict_get(bs->options, *option_name);
   7307            if (!entry) {
   7308                continue;
   7309            }
   7310
   7311            qdict_put_obj(d, *option_name, qobject_ref(entry));
   7312            option_given = true;
   7313        } else {
   7314            const QDictEntry *entry;
   7315            for (entry = qdict_first(bs->options); entry;
   7316                 entry = qdict_next(bs->options, entry))
   7317            {
   7318                if (strstart(qdict_entry_key(entry), *option_name, NULL)) {
   7319                    qdict_put_obj(d, qdict_entry_key(entry),
   7320                                  qobject_ref(qdict_entry_value(entry)));
   7321                    option_given = true;
   7322                }
   7323            }
   7324        }
   7325
   7326        /* While "driver" and "filename" need to be included in a JSON filename,
   7327         * their existence does not prohibit generation of a plain filename. */
   7328        if (!found_any && option_given &&
   7329            strcmp(*option_name, "driver") && strcmp(*option_name, "filename"))
   7330        {
   7331            found_any = true;
   7332        }
   7333    }
   7334
   7335    if (!qdict_haskey(d, "driver")) {
   7336        /* Drivers created with bdrv_new_open_driver() may not have a
   7337         * @driver option.  Add it here. */
   7338        qdict_put_str(d, "driver", bs->drv->format_name);
   7339    }
   7340
   7341    return found_any;
   7342}
   7343
   7344/* Note: This function may return false positives; it may return true
   7345 * even if opening the backing file specified by bs's image header
   7346 * would result in exactly bs->backing. */
   7347bool bdrv_backing_overridden(BlockDriverState *bs)
   7348{
   7349    if (bs->backing) {
   7350        return strcmp(bs->auto_backing_file,
   7351                      bs->backing->bs->filename);
   7352    } else {
   7353        /* No backing BDS, so if the image header reports any backing
   7354         * file, it must have been suppressed */
   7355        return bs->auto_backing_file[0] != '\0';
   7356    }
   7357}
   7358
   7359/* Updates the following BDS fields:
   7360 *  - exact_filename: A filename which may be used for opening a block device
   7361 *                    which (mostly) equals the given BDS (even without any
   7362 *                    other options; so reading and writing must return the same
   7363 *                    results, but caching etc. may be different)
   7364 *  - full_open_options: Options which, when given when opening a block device
   7365 *                       (without a filename), result in a BDS (mostly)
   7366 *                       equalling the given one
   7367 *  - filename: If exact_filename is set, it is copied here. Otherwise,
   7368 *              full_open_options is converted to a JSON object, prefixed with
   7369 *              "json:" (for use through the JSON pseudo protocol) and put here.
   7370 */
   7371void bdrv_refresh_filename(BlockDriverState *bs)
   7372{
   7373    BlockDriver *drv = bs->drv;
   7374    BdrvChild *child;
   7375    BlockDriverState *primary_child_bs;
   7376    QDict *opts;
   7377    bool backing_overridden;
   7378    bool generate_json_filename; /* Whether our default implementation should
   7379                                    fill exact_filename (false) or not (true) */
   7380
   7381    if (!drv) {
   7382        return;
   7383    }
   7384
   7385    /* This BDS's file name may depend on any of its children's file names, so
   7386     * refresh those first */
   7387    QLIST_FOREACH(child, &bs->children, next) {
   7388        bdrv_refresh_filename(child->bs);
   7389    }
   7390
   7391    if (bs->implicit) {
   7392        /* For implicit nodes, just copy everything from the single child */
   7393        child = QLIST_FIRST(&bs->children);
   7394        assert(QLIST_NEXT(child, next) == NULL);
   7395
   7396        pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
   7397                child->bs->exact_filename);
   7398        pstrcpy(bs->filename, sizeof(bs->filename), child->bs->filename);
   7399
   7400        qobject_unref(bs->full_open_options);
   7401        bs->full_open_options = qobject_ref(child->bs->full_open_options);
   7402
   7403        return;
   7404    }
   7405
   7406    backing_overridden = bdrv_backing_overridden(bs);
   7407
   7408    if (bs->open_flags & BDRV_O_NO_IO) {
   7409        /* Without I/O, the backing file does not change anything.
   7410         * Therefore, in such a case (primarily qemu-img), we can
   7411         * pretend the backing file has not been overridden even if
   7412         * it technically has been. */
   7413        backing_overridden = false;
   7414    }
   7415
   7416    /* Gather the options QDict */
   7417    opts = qdict_new();
   7418    generate_json_filename = append_strong_runtime_options(opts, bs);
   7419    generate_json_filename |= backing_overridden;
   7420
   7421    if (drv->bdrv_gather_child_options) {
   7422        /* Some block drivers may not want to present all of their children's
   7423         * options, or name them differently from BdrvChild.name */
   7424        drv->bdrv_gather_child_options(bs, opts, backing_overridden);
   7425    } else {
   7426        QLIST_FOREACH(child, &bs->children, next) {
   7427            if (child == bs->backing && !backing_overridden) {
   7428                /* We can skip the backing BDS if it has not been overridden */
   7429                continue;
   7430            }
   7431
   7432            qdict_put(opts, child->name,
   7433                      qobject_ref(child->bs->full_open_options));
   7434        }
   7435
   7436        if (backing_overridden && !bs->backing) {
   7437            /* Force no backing file */
   7438            qdict_put_null(opts, "backing");
   7439        }
   7440    }
   7441
   7442    qobject_unref(bs->full_open_options);
   7443    bs->full_open_options = opts;
   7444
   7445    primary_child_bs = bdrv_primary_bs(bs);
   7446
   7447    if (drv->bdrv_refresh_filename) {
   7448        /* Obsolete information is of no use here, so drop the old file name
   7449         * information before refreshing it */
   7450        bs->exact_filename[0] = '\0';
   7451
   7452        drv->bdrv_refresh_filename(bs);
   7453    } else if (primary_child_bs) {
   7454        /*
   7455         * Try to reconstruct valid information from the underlying
   7456         * file -- this only works for format nodes (filter nodes
   7457         * cannot be probed and as such must be selected by the user
   7458         * either through an options dict, or through a special
   7459         * filename which the filter driver must construct in its
   7460         * .bdrv_refresh_filename() implementation).
   7461         */
   7462
   7463        bs->exact_filename[0] = '\0';
   7464
   7465        /*
   7466         * We can use the underlying file's filename if:
   7467         * - it has a filename,
   7468         * - the current BDS is not a filter,
   7469         * - the file is a protocol BDS, and
   7470         * - opening that file (as this BDS's format) will automatically create
   7471         *   the BDS tree we have right now, that is:
   7472         *   - the user did not significantly change this BDS's behavior with
   7473         *     some explicit (strong) options
   7474         *   - no non-file child of this BDS has been overridden by the user
   7475         *   Both of these conditions are represented by generate_json_filename.
   7476         */
   7477        if (primary_child_bs->exact_filename[0] &&
   7478            primary_child_bs->drv->bdrv_file_open &&
   7479            !drv->is_filter && !generate_json_filename)
   7480        {
   7481            strcpy(bs->exact_filename, primary_child_bs->exact_filename);
   7482        }
   7483    }
   7484
   7485    if (bs->exact_filename[0]) {
   7486        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
   7487    } else {
   7488        GString *json = qobject_to_json(QOBJECT(bs->full_open_options));
   7489        if (snprintf(bs->filename, sizeof(bs->filename), "json:%s",
   7490                     json->str) >= sizeof(bs->filename)) {
   7491            /* Give user a hint if we truncated things. */
   7492            strcpy(bs->filename + sizeof(bs->filename) - 4, "...");
   7493        }
   7494        g_string_free(json, true);
   7495    }
   7496}
   7497
   7498char *bdrv_dirname(BlockDriverState *bs, Error **errp)
   7499{
   7500    BlockDriver *drv = bs->drv;
   7501    BlockDriverState *child_bs;
   7502
   7503    if (!drv) {
   7504        error_setg(errp, "Node '%s' is ejected", bs->node_name);
   7505        return NULL;
   7506    }
   7507
   7508    if (drv->bdrv_dirname) {
   7509        return drv->bdrv_dirname(bs, errp);
   7510    }
   7511
   7512    child_bs = bdrv_primary_bs(bs);
   7513    if (child_bs) {
   7514        return bdrv_dirname(child_bs, errp);
   7515    }
   7516
   7517    bdrv_refresh_filename(bs);
   7518    if (bs->exact_filename[0] != '\0') {
   7519        return path_combine(bs->exact_filename, "");
   7520    }
   7521
   7522    error_setg(errp, "Cannot generate a base directory for %s nodes",
   7523               drv->format_name);
   7524    return NULL;
   7525}
   7526
   7527/*
   7528 * Hot add/remove a BDS's child. So the user can take a child offline when
   7529 * it is broken and take a new child online
   7530 */
   7531void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
   7532                    Error **errp)
   7533{
   7534
   7535    if (!parent_bs->drv || !parent_bs->drv->bdrv_add_child) {
   7536        error_setg(errp, "The node %s does not support adding a child",
   7537                   bdrv_get_device_or_node_name(parent_bs));
   7538        return;
   7539    }
   7540
   7541    if (!QLIST_EMPTY(&child_bs->parents)) {
   7542        error_setg(errp, "The node %s already has a parent",
   7543                   child_bs->node_name);
   7544        return;
   7545    }
   7546
   7547    parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp);
   7548}
   7549
   7550void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp)
   7551{
   7552    BdrvChild *tmp;
   7553
   7554    if (!parent_bs->drv || !parent_bs->drv->bdrv_del_child) {
   7555        error_setg(errp, "The node %s does not support removing a child",
   7556                   bdrv_get_device_or_node_name(parent_bs));
   7557        return;
   7558    }
   7559
   7560    QLIST_FOREACH(tmp, &parent_bs->children, next) {
   7561        if (tmp == child) {
   7562            break;
   7563        }
   7564    }
   7565
   7566    if (!tmp) {
   7567        error_setg(errp, "The node %s does not have a child named %s",
   7568                   bdrv_get_device_or_node_name(parent_bs),
   7569                   bdrv_get_device_or_node_name(child->bs));
   7570        return;
   7571    }
   7572
   7573    parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
   7574}
   7575
   7576int bdrv_make_empty(BdrvChild *c, Error **errp)
   7577{
   7578    BlockDriver *drv = c->bs->drv;
   7579    int ret;
   7580
   7581    assert(c->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED));
   7582
   7583    if (!drv->bdrv_make_empty) {
   7584        error_setg(errp, "%s does not support emptying nodes",
   7585                   drv->format_name);
   7586        return -ENOTSUP;
   7587    }
   7588
   7589    ret = drv->bdrv_make_empty(c->bs);
   7590    if (ret < 0) {
   7591        error_setg_errno(errp, -ret, "Failed to empty %s",
   7592                         c->bs->filename);
   7593        return ret;
   7594    }
   7595
   7596    return 0;
   7597}
   7598
   7599/*
   7600 * Return the child that @bs acts as an overlay for, and from which data may be
   7601 * copied in COW or COR operations.  Usually this is the backing file.
   7602 */
   7603BdrvChild *bdrv_cow_child(BlockDriverState *bs)
   7604{
   7605    if (!bs || !bs->drv) {
   7606        return NULL;
   7607    }
   7608
   7609    if (bs->drv->is_filter) {
   7610        return NULL;
   7611    }
   7612
   7613    if (!bs->backing) {
   7614        return NULL;
   7615    }
   7616
   7617    assert(bs->backing->role & BDRV_CHILD_COW);
   7618    return bs->backing;
   7619}
   7620
   7621/*
   7622 * If @bs acts as a filter for exactly one of its children, return
   7623 * that child.
   7624 */
   7625BdrvChild *bdrv_filter_child(BlockDriverState *bs)
   7626{
   7627    BdrvChild *c;
   7628
   7629    if (!bs || !bs->drv) {
   7630        return NULL;
   7631    }
   7632
   7633    if (!bs->drv->is_filter) {
   7634        return NULL;
   7635    }
   7636
   7637    /* Only one of @backing or @file may be used */
   7638    assert(!(bs->backing && bs->file));
   7639
   7640    c = bs->backing ?: bs->file;
   7641    if (!c) {
   7642        return NULL;
   7643    }
   7644
   7645    assert(c->role & BDRV_CHILD_FILTERED);
   7646    return c;
   7647}
   7648
   7649/*
   7650 * Return either the result of bdrv_cow_child() or bdrv_filter_child(),
   7651 * whichever is non-NULL.
   7652 *
   7653 * Return NULL if both are NULL.
   7654 */
   7655BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs)
   7656{
   7657    BdrvChild *cow_child = bdrv_cow_child(bs);
   7658    BdrvChild *filter_child = bdrv_filter_child(bs);
   7659
   7660    /* Filter nodes cannot have COW backing files */
   7661    assert(!(cow_child && filter_child));
   7662
   7663    return cow_child ?: filter_child;
   7664}
   7665
   7666/*
   7667 * Return the primary child of this node: For filters, that is the
   7668 * filtered child.  For other nodes, that is usually the child storing
   7669 * metadata.
   7670 * (A generally more helpful description is that this is (usually) the
   7671 * child that has the same filename as @bs.)
   7672 *
   7673 * Drivers do not necessarily have a primary child; for example quorum
   7674 * does not.
   7675 */
   7676BdrvChild *bdrv_primary_child(BlockDriverState *bs)
   7677{
   7678    BdrvChild *c, *found = NULL;
   7679
   7680    QLIST_FOREACH(c, &bs->children, next) {
   7681        if (c->role & BDRV_CHILD_PRIMARY) {
   7682            assert(!found);
   7683            found = c;
   7684        }
   7685    }
   7686
   7687    return found;
   7688}
   7689
   7690static BlockDriverState *bdrv_do_skip_filters(BlockDriverState *bs,
   7691                                              bool stop_on_explicit_filter)
   7692{
   7693    BdrvChild *c;
   7694
   7695    if (!bs) {
   7696        return NULL;
   7697    }
   7698
   7699    while (!(stop_on_explicit_filter && !bs->implicit)) {
   7700        c = bdrv_filter_child(bs);
   7701        if (!c) {
   7702            /*
   7703             * A filter that is embedded in a working block graph must
   7704             * have a child.  Assert this here so this function does
   7705             * not return a filter node that is not expected by the
   7706             * caller.
   7707             */
   7708            assert(!bs->drv || !bs->drv->is_filter);
   7709            break;
   7710        }
   7711        bs = c->bs;
   7712    }
   7713    /*
   7714     * Note that this treats nodes with bs->drv == NULL as not being
   7715     * filters (bs->drv == NULL should be replaced by something else
   7716     * anyway).
   7717     * The advantage of this behavior is that this function will thus
   7718     * always return a non-NULL value (given a non-NULL @bs).
   7719     */
   7720
   7721    return bs;
   7722}
   7723
   7724/*
   7725 * Return the first BDS that has not been added implicitly or that
   7726 * does not have a filtered child down the chain starting from @bs
   7727 * (including @bs itself).
   7728 */
   7729BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
   7730{
   7731    return bdrv_do_skip_filters(bs, true);
   7732}
   7733
   7734/*
   7735 * Return the first BDS that does not have a filtered child down the
   7736 * chain starting from @bs (including @bs itself).
   7737 */
   7738BlockDriverState *bdrv_skip_filters(BlockDriverState *bs)
   7739{
   7740    return bdrv_do_skip_filters(bs, false);
   7741}
   7742
   7743/*
   7744 * For a backing chain, return the first non-filter backing image of
   7745 * the first non-filter image.
   7746 */
   7747BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
   7748{
   7749    return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
   7750}
   7751
   7752/**
   7753 * Check whether [offset, offset + bytes) overlaps with the cached
   7754 * block-status data region.
   7755 *
   7756 * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`,
   7757 * which is what bdrv_bsc_is_data()'s interface needs.
   7758 * Otherwise, *pnum is not touched.
   7759 */
   7760static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
   7761                                           int64_t offset, int64_t bytes,
   7762                                           int64_t *pnum)
   7763{
   7764    BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache);
   7765    bool overlaps;
   7766
   7767    overlaps =
   7768        qatomic_read(&bsc->valid) &&
   7769        ranges_overlap(offset, bytes, bsc->data_start,
   7770                       bsc->data_end - bsc->data_start);
   7771
   7772    if (overlaps && pnum) {
   7773        *pnum = bsc->data_end - offset;
   7774    }
   7775
   7776    return overlaps;
   7777}
   7778
   7779/**
   7780 * See block_int.h for this function's documentation.
   7781 */
   7782bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
   7783{
   7784    RCU_READ_LOCK_GUARD();
   7785
   7786    return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
   7787}
   7788
   7789/**
   7790 * See block_int.h for this function's documentation.
   7791 */
   7792void bdrv_bsc_invalidate_range(BlockDriverState *bs,
   7793                               int64_t offset, int64_t bytes)
   7794{
   7795    RCU_READ_LOCK_GUARD();
   7796
   7797    if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
   7798        qatomic_set(&bs->block_status_cache->valid, false);
   7799    }
   7800}
   7801
   7802/**
   7803 * See block_int.h for this function's documentation.
   7804 */
   7805void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes)
   7806{
   7807    BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
   7808    BdrvBlockStatusCache *old_bsc;
   7809
   7810    *new_bsc = (BdrvBlockStatusCache) {
   7811        .valid = true,
   7812        .data_start = offset,
   7813        .data_end = offset + bytes,
   7814    };
   7815
   7816    QEMU_LOCK_GUARD(&bs->bsc_modify_lock);
   7817
   7818    old_bsc = qatomic_rcu_read(&bs->block_status_cache);
   7819    qatomic_rcu_set(&bs->block_status_cache, new_bsc);
   7820    if (old_bsc) {
   7821        g_free_rcu(old_bsc, rcu);
   7822    }
   7823}