cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

file-posix.c (115933B)


      1/*
      2 * Block driver for RAW files (posix)
      3 *
      4 * Copyright (c) 2006 Fabrice Bellard
      5 *
      6 * Permission is hereby granted, free of charge, to any person obtaining a copy
      7 * of this software and associated documentation files (the "Software"), to deal
      8 * in the Software without restriction, including without limitation the rights
      9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10 * copies of the Software, and to permit persons to whom the Software is
     11 * furnished to do so, subject to the following conditions:
     12 *
     13 * The above copyright notice and this permission notice shall be included in
     14 * all copies or substantial portions of the Software.
     15 *
     16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22 * THE SOFTWARE.
     23 */
     24
     25#include "qemu/osdep.h"
     26#include "qemu-common.h"
     27#include "qapi/error.h"
     28#include "qemu/cutils.h"
     29#include "qemu/error-report.h"
     30#include "block/block_int.h"
     31#include "qemu/module.h"
     32#include "qemu/option.h"
     33#include "qemu/units.h"
     34#include "trace.h"
     35#include "block/thread-pool.h"
     36#include "qemu/iov.h"
     37#include "block/raw-aio.h"
     38#include "qapi/qmp/qdict.h"
     39#include "qapi/qmp/qstring.h"
     40
     41#include "scsi/pr-manager.h"
     42#include "scsi/constants.h"
     43
     44#if defined(__APPLE__) && (__MACH__)
     45#include <sys/ioctl.h>
     46#if defined(HAVE_HOST_BLOCK_DEVICE)
     47#include <paths.h>
     48#include <sys/param.h>
     49#include <sys/mount.h>
     50#include <IOKit/IOKitLib.h>
     51#include <IOKit/IOBSD.h>
     52#include <IOKit/storage/IOMediaBSDClient.h>
     53#include <IOKit/storage/IOMedia.h>
     54#include <IOKit/storage/IOCDMedia.h>
     55//#include <IOKit/storage/IOCDTypes.h>
     56#include <IOKit/storage/IODVDMedia.h>
     57#include <CoreFoundation/CoreFoundation.h>
     58#endif /* defined(HAVE_HOST_BLOCK_DEVICE) */
     59#endif
     60
     61#ifdef __sun__
     62#define _POSIX_PTHREAD_SEMANTICS 1
     63#include <sys/dkio.h>
     64#endif
     65#ifdef __linux__
     66#include <sys/ioctl.h>
     67#include <sys/param.h>
     68#include <sys/syscall.h>
     69#include <sys/vfs.h>
     70#include <linux/cdrom.h>
     71#include <linux/fd.h>
     72#include <linux/fs.h>
     73#include <linux/hdreg.h>
     74#include <linux/magic.h>
     75#include <scsi/sg.h>
     76#ifdef __s390__
     77#include <asm/dasd.h>
     78#endif
     79#ifndef FS_NOCOW_FL
     80#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
     81#endif
     82#endif
     83#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
     84#include <linux/falloc.h>
     85#endif
     86#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
     87#include <sys/disk.h>
     88#include <sys/cdio.h>
     89#endif
     90
     91#ifdef __OpenBSD__
     92#include <sys/ioctl.h>
     93#include <sys/disklabel.h>
     94#include <sys/dkio.h>
     95#endif
     96
     97#ifdef __NetBSD__
     98#include <sys/ioctl.h>
     99#include <sys/disklabel.h>
    100#include <sys/dkio.h>
    101#include <sys/disk.h>
    102#endif
    103
    104#ifdef __DragonFly__
    105#include <sys/ioctl.h>
    106#include <sys/diskslice.h>
    107#endif
    108
    109#ifdef CONFIG_XFS
    110#include <xfs/xfs.h>
    111#endif
    112
    113/* OS X does not have O_DSYNC */
    114#ifndef O_DSYNC
    115#ifdef O_SYNC
    116#define O_DSYNC O_SYNC
    117#elif defined(O_FSYNC)
    118#define O_DSYNC O_FSYNC
    119#endif
    120#endif
    121
    122/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
    123#ifndef O_DIRECT
    124#define O_DIRECT O_DSYNC
    125#endif
    126
/* Values stored in BDRVRawState.type; raw_open() uses FTYPE_FILE. */
#define FTYPE_FILE   0
#define FTYPE_CD     1

/* Lower bound for the largest alignment tried by raw_probe_alignment(). */
#define MAX_BLOCKSIZE	4096

/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
 * leaving a few more bytes for its future use.
 *
 * Permission bit i is represented by the byte at RAW_LOCK_PERM_BASE + i and
 * by the byte at RAW_LOCK_SHARED_BASE + i (see raw_apply_lock_bytes()). */
#define RAW_LOCK_PERM_BASE             100
#define RAW_LOCK_SHARED_BASE           200
    136
/*
 * Driver-private state of the POSIX raw file driver; it is reached through
 * bs->opaque.
 */
typedef struct BDRVRawState {
    /* Descriptor returned by qemu_open() in raw_open_common(), or -1. */
    int fd;
    /* Whether file locking is in effect; derived from the "locking" option. */
    bool use_lock;
    /* One of the FTYPE_* values. */
    int type;
    /* open(2) flags computed by raw_parse_flags() and handed to qemu_open(). */
    int open_flags;
    /* Memory buffer alignment found by raw_probe_alignment(). */
    size_t buf_align;

    /* The current permissions. */
    uint64_t perm;
    uint64_t shared_perm;

    /* The perms bits whose corresponding bytes are already locked in
     * s->fd. */
    uint64_t locked_perm;
    uint64_t locked_shared_perm;

    /*
     * NOTE(review): the three fields below are not used in the part of the
     * file shown here; presumably they carry state for permission changes
     * and reopen - confirm against their users.
     */
    int perm_change_fd;
    int perm_change_flags;
    BDRVReopenState *reopen_state;

#ifdef CONFIG_XFS
    /* Set when platform_test_xfs_fd() reports true for the descriptor. */
    bool is_xfs:1;
#endif
    /* Both set to true at open; has_write_zeroes is cleared again for
     * buffered block devices on Linux. */
    bool has_discard:1;
    bool has_write_zeroes:1;
    /* True for regular files and for block devices that report
     * BLKDISCARDZEROES; cleared for buffered block devices on Linux. */
    bool discard_zeroes:1;
    /* Selected by the "aio" option (native / io_uring). */
    bool use_linux_aio:1;
    bool use_linux_io_uring:1;
    int page_cache_inconsistent; /* errno from fdatasync failure */
    /* Set for regular files in raw_open_common(). */
    bool has_fallocate;
    /* Requests must honour the probed alignment (O_DIRECT on a file system
     * that is not byte-aligned, or a FreeBSD character device). */
    bool needs_alignment;
    /* Values of the "drop-cache" and "x-check-cache-dropped" options. */
    bool drop_cache;
    bool check_cache_dropped;
    /* NOTE(review): counters are not updated in the visible part of the
     * file; the names suggest discard statistics - confirm. */
    struct {
        uint64_t discard_nb_ok;
        uint64_t discard_nb_failed;
        uint64_t discard_bytes_ok;
    } stats;

    /* Result of pr_manager_lookup() for the "pr-manager" option, if given. */
    PRManager *pr_mgr;
} BDRVRawState;
    178
/*
 * Subset of BDRVRawState settings kept separately for a reopen.
 * NOTE(review): the code that fills and consumes this structure is outside
 * the visible part of the file; the field names mirror the BDRVRawState
 * members of the same name - confirm the exact lifecycle there.
 */
typedef struct BDRVRawReopenState {
    int open_flags;
    bool drop_cache;
    bool check_cache_dropped;
} BDRVRawReopenState;
    184
    185static int fd_open(BlockDriverState *bs)
    186{
    187    BDRVRawState *s = bs->opaque;
    188
    189    /* this is just to ensure s->fd is sane (its called by io ops) */
    190    if (s->fd >= 0) {
    191        return 0;
    192    }
    193    return -EIO;
    194}
    195
    196static int64_t raw_getlength(BlockDriverState *bs);
    197
/*
 * Description of one request: the node it belongs to, a request type, the
 * target descriptor and byte range, plus a per-type payload in the anonymous
 * union.
 * NOTE(review): the producers and consumers of this structure are outside
 * the visible part of the file; the per-field notes below are derived from
 * the member names only - confirm against the code that fills them in.
 */
typedef struct RawPosixAIOData {
    BlockDriverState *bs;
    int aio_type;               /* presumably selects the active union arm */
    int aio_fildes;

    off_t aio_offset;
    uint64_t aio_nbytes;

    union {
        /* Vectored I/O: array of @niov iovec entries. */
        struct {
            struct iovec *iov;
            int niov;
        } io;
        /* ioctl request: command number and its argument buffer. */
        struct {
            uint64_t cmd;
            void *buf;
        } ioctl;
        /* Second descriptor/offset pair for a copy between two files. */
        struct {
            int aio_fd2;
            off_t aio_offset2;
        } copy_range;
        /* Truncate: preallocation mode and where to report errors. */
        struct {
            PreallocMode prealloc;
            Error **errp;
        } truncate;
    };
} RawPosixAIOData;
    225
    226#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
    227static int cdrom_reopen(BlockDriverState *bs);
    228#endif
    229
/*
 * Elide EAGAIN and EACCES details when failing to lock, as this
 * indicates that the specified file region is already locked by
 * another process, which is considered a common scenario.
 *
 * @errp: error object to fill in
 * @err:  positive errno value of the failed lock operation
 * @fmt:  printf-style format, followed by its arguments
 *
 * For EAGAIN/EACCES the message is set with error_setg() (no errno text);
 * for every other value error_setg_errno() is used. Note that @err is
 * evaluated more than once, so it must not have side effects.
 */
#define raw_lock_error_setg_errno(errp, err, fmt, ...)                  \
    do {                                                                \
        if ((err) == EAGAIN || (err) == EACCES) {                       \
            error_setg((errp), (fmt), ## __VA_ARGS__);                  \
        } else {                                                        \
            error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__);     \
        }                                                               \
    } while (0)
    243
    244#if defined(__NetBSD__)
    245static int raw_normalize_devicepath(const char **filename, Error **errp)
    246{
    247    static char namebuf[PATH_MAX];
    248    const char *dp, *fname;
    249    struct stat sb;
    250
    251    fname = *filename;
    252    dp = strrchr(fname, '/');
    253    if (lstat(fname, &sb) < 0) {
    254        error_setg_file_open(errp, errno, fname);
    255        return -errno;
    256    }
    257
    258    if (!S_ISBLK(sb.st_mode)) {
    259        return 0;
    260    }
    261
    262    if (dp == NULL) {
    263        snprintf(namebuf, PATH_MAX, "r%s", fname);
    264    } else {
    265        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
    266            (int)(dp - fname), fname, dp + 1);
    267    }
    268    *filename = namebuf;
    269    warn_report("%s is a block device, using %s", fname, *filename);
    270
    271    return 0;
    272}
    273#else
    274static int raw_normalize_devicepath(const char **filename, Error **errp)
    275{
    276    return 0;
    277}
    278#endif
    279
    280/*
    281 * Get logical block size via ioctl. On success store it in @sector_size_p.
    282 */
    283static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
    284{
    285    unsigned int sector_size;
    286    bool success = false;
    287    int i;
    288
    289    errno = ENOTSUP;
    290    static const unsigned long ioctl_list[] = {
    291#ifdef BLKSSZGET
    292        BLKSSZGET,
    293#endif
    294#ifdef DKIOCGETBLOCKSIZE
    295        DKIOCGETBLOCKSIZE,
    296#endif
    297#ifdef DIOCGSECTORSIZE
    298        DIOCGSECTORSIZE,
    299#endif
    300    };
    301
    302    /* Try a few ioctls to get the right size */
    303    for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
    304        if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
    305            *sector_size_p = sector_size;
    306            success = true;
    307        }
    308    }
    309
    310    return success ? 0 : -errno;
    311}
    312
    313/**
    314 * Get physical block size of @fd.
    315 * On success, store it in @blk_size and return 0.
    316 * On failure, return -errno.
    317 */
    318static int probe_physical_blocksize(int fd, unsigned int *blk_size)
    319{
    320#ifdef BLKPBSZGET
    321    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
    322        return -errno;
    323    }
    324    return 0;
    325#else
    326    return -ENOTSUP;
    327#endif
    328}
    329
    330/*
    331 * Returns true if no alignment restrictions are necessary even for files
    332 * opened with O_DIRECT.
    333 *
    334 * raw_probe_alignment() probes the required alignment and assume that 1 means
    335 * the probing failed, so it falls back to a safe default of 4k. This can be
    336 * avoided if we know that byte alignment is okay for the file.
    337 */
    338static bool dio_byte_aligned(int fd)
    339{
    340#ifdef __linux__
    341    struct statfs buf;
    342    int ret;
    343
    344    ret = fstatfs(fd, &buf);
    345    if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) {
    346        return true;
    347    }
    348#endif
    349    return false;
    350}
    351
    352/* Check if read is allowed with given memory buffer and length.
    353 *
    354 * This function is used to check O_DIRECT memory buffer and request alignment.
    355 */
    356static bool raw_is_io_aligned(int fd, void *buf, size_t len)
    357{
    358    ssize_t ret = pread(fd, buf, len, 0);
    359
    360    if (ret >= 0) {
    361        return true;
    362    }
    363
    364#ifdef __linux__
    365    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
    366     * other errors (e.g. real I/O error), which could happen on a failed
    367     * drive, since we only care about probing alignment.
    368     */
    369    if (errno != EINVAL) {
    370        return true;
    371    }
    372#endif
    373
    374    return false;
    375}
    376
    377static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
    378{
    379    BDRVRawState *s = bs->opaque;
    380    char *buf;
    381    size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
    382    size_t alignments[] = {1, 512, 1024, 2048, 4096};
    383
    384    /* For SCSI generic devices the alignment is not really used.
    385       With buffered I/O, we don't have any restrictions. */
    386    if (bdrv_is_sg(bs) || !s->needs_alignment) {
    387        bs->bl.request_alignment = 1;
    388        s->buf_align = 1;
    389        return;
    390    }
    391
    392    bs->bl.request_alignment = 0;
    393    s->buf_align = 0;
    394    /* Let's try to use the logical blocksize for the alignment. */
    395    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
    396        bs->bl.request_alignment = 0;
    397    }
    398#ifdef CONFIG_XFS
    399    if (s->is_xfs) {
    400        struct dioattr da;
    401        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
    402            bs->bl.request_alignment = da.d_miniosz;
    403            /* The kernel returns wrong information for d_mem */
    404            /* s->buf_align = da.d_mem; */
    405        }
    406    }
    407#endif
    408
    409    /*
    410     * If we could not get the sizes so far, we can only guess them. First try
    411     * to detect request alignment, since it is more likely to succeed. Then
    412     * try to detect buf_align, which cannot be detected in some cases (e.g.
    413     * Gluster). If buf_align cannot be detected, we fallback to the value of
    414     * request_alignment.
    415     */
    416
    417    if (!bs->bl.request_alignment) {
    418        int i;
    419        size_t align;
    420        buf = qemu_memalign(max_align, max_align);
    421        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
    422            align = alignments[i];
    423            if (raw_is_io_aligned(fd, buf, align)) {
    424                /* Fallback to safe value. */
    425                bs->bl.request_alignment = (align != 1) ? align : max_align;
    426                break;
    427            }
    428        }
    429        qemu_vfree(buf);
    430    }
    431
    432    if (!s->buf_align) {
    433        int i;
    434        size_t align;
    435        buf = qemu_memalign(max_align, 2 * max_align);
    436        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
    437            align = alignments[i];
    438            if (raw_is_io_aligned(fd, buf + align, max_align)) {
    439                /* Fallback to request_alignment. */
    440                s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
    441                break;
    442            }
    443        }
    444        qemu_vfree(buf);
    445    }
    446
    447    if (!s->buf_align || !bs->bl.request_alignment) {
    448        error_setg(errp, "Could not find working O_DIRECT alignment");
    449        error_append_hint(errp, "Try cache.direct=off\n");
    450    }
    451}
    452
    453static int check_hdev_writable(int fd)
    454{
    455#if defined(BLKROGET)
    456    /* Linux block devices can be configured "read-only" using blockdev(8).
    457     * This is independent of device node permissions and therefore open(2)
    458     * with O_RDWR succeeds.  Actual writes fail with EPERM.
    459     *
    460     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
    461     * check for read-only block devices so that Linux block devices behave
    462     * properly.
    463     */
    464    struct stat st;
    465    int readonly = 0;
    466
    467    if (fstat(fd, &st)) {
    468        return -errno;
    469    }
    470
    471    if (!S_ISBLK(st.st_mode)) {
    472        return 0;
    473    }
    474
    475    if (ioctl(fd, BLKROGET, &readonly) < 0) {
    476        return -errno;
    477    }
    478
    479    if (readonly) {
    480        return -EACCES;
    481    }
    482#endif /* defined(BLKROGET) */
    483    return 0;
    484}
    485
    486static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
    487{
    488    bool read_write = false;
    489    assert(open_flags != NULL);
    490
    491    *open_flags |= O_BINARY;
    492    *open_flags &= ~O_ACCMODE;
    493
    494    if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
    495        read_write = has_writers;
    496    } else if (bdrv_flags & BDRV_O_RDWR) {
    497        read_write = true;
    498    }
    499
    500    if (read_write) {
    501        *open_flags |= O_RDWR;
    502    } else {
    503        *open_flags |= O_RDONLY;
    504    }
    505
    506    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
    507     * and O_DIRECT for no caching. */
    508    if ((bdrv_flags & BDRV_O_NOCACHE)) {
    509        *open_flags |= O_DIRECT;
    510    }
    511}
    512
/*
 * Filename parser for this driver: delegates to
 * bdrv_parse_filename_strip_prefix() with the "file:" prefix and the
 * @options dictionary. @errp is not used here.
 */
static void raw_parse_filename(const char *filename, QDict *options,
                               Error **errp)
{
    bdrv_parse_filename_strip_prefix(filename, "file:", options);
}
    518
/*
 * Runtime options understood by raw_open_common(); the options dictionary
 * passed at open time is absorbed into a QemuOpts created from this list.
 */
static QemuOptsList raw_runtime_opts = {
    .name = "raw",
    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "File name of the image",
        },
        {
            /* Parsed as a BlockdevAioOptions enum value. */
            .name = "aio",
            .type = QEMU_OPT_STRING,
            .help = "host AIO implementation (threads, native, io_uring)",
        },
        {
            /* Parsed as an OnOffAuto enum value. */
            .name = "locking",
            .type = QEMU_OPT_STRING,
            .help = "file locking mode (on/off/auto, default: auto)",
        },
        {
            .name = "pr-manager",
            .type = QEMU_OPT_STRING,
            .help = "id of persistent reservation manager object (default: none)",
        },
#if defined(__linux__)
        /* Only accepted on Linux; elsewhere the default (true) is used. */
        {
            .name = "drop-cache",
            .type = QEMU_OPT_BOOL,
            .help = "invalidate page cache during live migration (default: on)",
        },
#endif
        {
            .name = "x-check-cache-dropped",
            .type = QEMU_OPT_BOOL,
            .help = "check that page cache was dropped on live migration (default: off)"
        },
        { /* end of list */ }
    },
};
    558
/*
 * NULL-terminated list of option names.
 * NOTE(review): presumably the options whose value may change on reopen;
 * the user of this list is outside the visible part of the file - confirm.
 */
static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
    560
    561static int raw_open_common(BlockDriverState *bs, QDict *options,
    562                           int bdrv_flags, int open_flags,
    563                           bool device, Error **errp)
    564{
    565    BDRVRawState *s = bs->opaque;
    566    QemuOpts *opts;
    567    Error *local_err = NULL;
    568    const char *filename = NULL;
    569    const char *str;
    570    BlockdevAioOptions aio, aio_default;
    571    int fd, ret;
    572    struct stat st;
    573    OnOffAuto locking;
    574
    575    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
    576    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
    577        ret = -EINVAL;
    578        goto fail;
    579    }
    580
    581    filename = qemu_opt_get(opts, "filename");
    582
    583    ret = raw_normalize_devicepath(&filename, errp);
    584    if (ret != 0) {
    585        goto fail;
    586    }
    587
    588    if (bdrv_flags & BDRV_O_NATIVE_AIO) {
    589        aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
    590#ifdef CONFIG_LINUX_IO_URING
    591    } else if (bdrv_flags & BDRV_O_IO_URING) {
    592        aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
    593#endif
    594    } else {
    595        aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
    596    }
    597
    598    aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
    599                          qemu_opt_get(opts, "aio"),
    600                          aio_default, &local_err);
    601    if (local_err) {
    602        error_propagate(errp, local_err);
    603        ret = -EINVAL;
    604        goto fail;
    605    }
    606
    607    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
    608#ifdef CONFIG_LINUX_IO_URING
    609    s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
    610#endif
    611
    612    locking = qapi_enum_parse(&OnOffAuto_lookup,
    613                              qemu_opt_get(opts, "locking"),
    614                              ON_OFF_AUTO_AUTO, &local_err);
    615    if (local_err) {
    616        error_propagate(errp, local_err);
    617        ret = -EINVAL;
    618        goto fail;
    619    }
    620    switch (locking) {
    621    case ON_OFF_AUTO_ON:
    622        s->use_lock = true;
    623        if (!qemu_has_ofd_lock()) {
    624            warn_report("File lock requested but OFD locking syscall is "
    625                        "unavailable, falling back to POSIX file locks");
    626            error_printf("Due to the implementation, locks can be lost "
    627                         "unexpectedly.\n");
    628        }
    629        break;
    630    case ON_OFF_AUTO_OFF:
    631        s->use_lock = false;
    632        break;
    633    case ON_OFF_AUTO_AUTO:
    634        s->use_lock = qemu_has_ofd_lock();
    635        break;
    636    default:
    637        abort();
    638    }
    639
    640    str = qemu_opt_get(opts, "pr-manager");
    641    if (str) {
    642        s->pr_mgr = pr_manager_lookup(str, &local_err);
    643        if (local_err) {
    644            error_propagate(errp, local_err);
    645            ret = -EINVAL;
    646            goto fail;
    647        }
    648    }
    649
    650    s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
    651    s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
    652                                               false);
    653
    654    s->open_flags = open_flags;
    655    raw_parse_flags(bdrv_flags, &s->open_flags, false);
    656
    657    s->fd = -1;
    658    fd = qemu_open(filename, s->open_flags, errp);
    659    ret = fd < 0 ? -errno : 0;
    660
    661    if (ret < 0) {
    662        if (ret == -EROFS) {
    663            ret = -EACCES;
    664        }
    665        goto fail;
    666    }
    667    s->fd = fd;
    668
    669    /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
    670    if (s->open_flags & O_RDWR) {
    671        ret = check_hdev_writable(s->fd);
    672        if (ret < 0) {
    673            error_setg_errno(errp, -ret, "The device is not writable");
    674            goto fail;
    675        }
    676    }
    677
    678    s->perm = 0;
    679    s->shared_perm = BLK_PERM_ALL;
    680
    681#ifdef CONFIG_LINUX_AIO
    682     /* Currently Linux does AIO only for files opened with O_DIRECT */
    683    if (s->use_linux_aio) {
    684        if (!(s->open_flags & O_DIRECT)) {
    685            error_setg(errp, "aio=native was specified, but it requires "
    686                             "cache.direct=on, which was not specified.");
    687            ret = -EINVAL;
    688            goto fail;
    689        }
    690        if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
    691            error_prepend(errp, "Unable to use native AIO: ");
    692            goto fail;
    693        }
    694    }
    695#else
    696    if (s->use_linux_aio) {
    697        error_setg(errp, "aio=native was specified, but is not supported "
    698                         "in this build.");
    699        ret = -EINVAL;
    700        goto fail;
    701    }
    702#endif /* !defined(CONFIG_LINUX_AIO) */
    703
    704#ifdef CONFIG_LINUX_IO_URING
    705    if (s->use_linux_io_uring) {
    706        if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
    707            error_prepend(errp, "Unable to use io_uring: ");
    708            goto fail;
    709        }
    710    }
    711#else
    712    if (s->use_linux_io_uring) {
    713        error_setg(errp, "aio=io_uring was specified, but is not supported "
    714                         "in this build.");
    715        ret = -EINVAL;
    716        goto fail;
    717    }
    718#endif /* !defined(CONFIG_LINUX_IO_URING) */
    719
    720    s->has_discard = true;
    721    s->has_write_zeroes = true;
    722    if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
    723        s->needs_alignment = true;
    724    }
    725
    726    if (fstat(s->fd, &st) < 0) {
    727        ret = -errno;
    728        error_setg_errno(errp, errno, "Could not stat file");
    729        goto fail;
    730    }
    731
    732    if (!device) {
    733        if (!S_ISREG(st.st_mode)) {
    734            error_setg(errp, "'%s' driver requires '%s' to be a regular file",
    735                       bs->drv->format_name, bs->filename);
    736            ret = -EINVAL;
    737            goto fail;
    738        } else {
    739            s->discard_zeroes = true;
    740            s->has_fallocate = true;
    741        }
    742    } else {
    743        if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
    744            error_setg(errp, "'%s' driver requires '%s' to be either "
    745                       "a character or block device",
    746                       bs->drv->format_name, bs->filename);
    747            ret = -EINVAL;
    748            goto fail;
    749        }
    750    }
    751
    752    if (S_ISBLK(st.st_mode)) {
    753#ifdef BLKDISCARDZEROES
    754        unsigned int arg;
    755        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
    756            s->discard_zeroes = true;
    757        }
    758#endif
    759#ifdef __linux__
    760        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
    761         * not rely on the contents of discarded blocks unless using O_DIRECT.
    762         * Same for BLKZEROOUT.
    763         */
    764        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
    765            s->discard_zeroes = false;
    766            s->has_write_zeroes = false;
    767        }
    768#endif
    769    }
    770#ifdef __FreeBSD__
    771    if (S_ISCHR(st.st_mode)) {
    772        /*
    773         * The file is a char device (disk), which on FreeBSD isn't behind
    774         * a pager, so force all requests to be aligned. This is needed
    775         * so QEMU makes sure all IO operations on the device are aligned
    776         * to sector size, or else FreeBSD will reject them with EINVAL.
    777         */
    778        s->needs_alignment = true;
    779    }
    780#endif
    781
    782#ifdef CONFIG_XFS
    783    if (platform_test_xfs_fd(s->fd)) {
    784        s->is_xfs = true;
    785    }
    786#endif
    787
    788    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
    789    if (S_ISREG(st.st_mode)) {
    790        /* When extending regular files, we get zeros from the OS */
    791        bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
    792    }
    793    ret = 0;
    794fail:
    795    if (ret < 0 && s->fd != -1) {
    796        qemu_close(s->fd);
    797    }
    798    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
    799        unlink(filename);
    800    }
    801    qemu_opts_del(opts);
    802    return ret;
    803}
    804
    805static int raw_open(BlockDriverState *bs, QDict *options, int flags,
    806                    Error **errp)
    807{
    808    BDRVRawState *s = bs->opaque;
    809
    810    s->type = FTYPE_FILE;
    811    return raw_open_common(bs, options, flags, 0, false, errp);
    812}
    813
/*
 * Operation selector passed to raw_handle_perm_lock().
 * NOTE(review): the handling of the individual values is outside the
 * visible part of the file; the names suggest the prepare / commit / abort
 * phases of a permission change - confirm there.
 */
typedef enum {
    RAW_PL_PREPARE,
    RAW_PL_COMMIT,
    RAW_PL_ABORT,
} RawPermLockOp;
    819
    820#define PERM_FOREACH(i) \
    821    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
    822
    823/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
    824 * file; if @unlock == true, also unlock the unneeded bytes.
    825 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
    826 */
    827static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
    828                                uint64_t perm_lock_bits,
    829                                uint64_t shared_perm_lock_bits,
    830                                bool unlock, Error **errp)
    831{
    832    int ret;
    833    int i;
    834    uint64_t locked_perm, locked_shared_perm;
    835
    836    if (s) {
    837        locked_perm = s->locked_perm;
    838        locked_shared_perm = s->locked_shared_perm;
    839    } else {
    840        /*
    841         * We don't have the previous bits, just lock/unlock for each of the
    842         * requested bits.
    843         */
    844        if (unlock) {
    845            locked_perm = BLK_PERM_ALL;
    846            locked_shared_perm = BLK_PERM_ALL;
    847        } else {
    848            locked_perm = 0;
    849            locked_shared_perm = 0;
    850        }
    851    }
    852
    853    PERM_FOREACH(i) {
    854        int off = RAW_LOCK_PERM_BASE + i;
    855        uint64_t bit = (1ULL << i);
    856        if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
    857            ret = qemu_lock_fd(fd, off, 1, false);
    858            if (ret) {
    859                raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
    860                                          off);
    861                return ret;
    862            } else if (s) {
    863                s->locked_perm |= bit;
    864            }
    865        } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
    866            ret = qemu_unlock_fd(fd, off, 1);
    867            if (ret) {
    868                error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
    869                return ret;
    870            } else if (s) {
    871                s->locked_perm &= ~bit;
    872            }
    873        }
    874    }
    875    PERM_FOREACH(i) {
    876        int off = RAW_LOCK_SHARED_BASE + i;
    877        uint64_t bit = (1ULL << i);
    878        if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
    879            ret = qemu_lock_fd(fd, off, 1, false);
    880            if (ret) {
    881                raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
    882                                          off);
    883                return ret;
    884            } else if (s) {
    885                s->locked_shared_perm |= bit;
    886            }
    887        } else if (unlock && (locked_shared_perm & bit) &&
    888                   !(shared_perm_lock_bits & bit)) {
    889            ret = qemu_unlock_fd(fd, off, 1);
    890            if (ret) {
    891                error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
    892                return ret;
    893            } else if (s) {
    894                s->locked_shared_perm &= ~bit;
    895            }
    896        }
    897    }
    898    return 0;
    899}
    900
    901/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
    902static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
    903                                Error **errp)
    904{
    905    int ret;
    906    int i;
    907
    908    PERM_FOREACH(i) {
    909        int off = RAW_LOCK_SHARED_BASE + i;
    910        uint64_t p = 1ULL << i;
    911        if (perm & p) {
    912            ret = qemu_lock_fd_test(fd, off, 1, true);
    913            if (ret) {
    914                char *perm_name = bdrv_perm_names(p);
    915
    916                raw_lock_error_setg_errno(errp, -ret,
    917                                          "Failed to get \"%s\" lock",
    918                                          perm_name);
    919                g_free(perm_name);
    920                return ret;
    921            }
    922        }
    923    }
    924    PERM_FOREACH(i) {
    925        int off = RAW_LOCK_PERM_BASE + i;
    926        uint64_t p = 1ULL << i;
    927        if (!(shared_perm & p)) {
    928            ret = qemu_lock_fd_test(fd, off, 1, true);
    929            if (ret) {
    930                char *perm_name = bdrv_perm_names(p);
    931
    932                raw_lock_error_setg_errno(errp, -ret,
    933                                          "Failed to get shared \"%s\" lock",
    934                                          perm_name);
    935                g_free(perm_name);
    936                return ret;
    937            }
    938        }
    939    }
    940    return 0;
    941}
    942
    943static int raw_handle_perm_lock(BlockDriverState *bs,
    944                                RawPermLockOp op,
    945                                uint64_t new_perm, uint64_t new_shared,
    946                                Error **errp)
    947{
    948    BDRVRawState *s = bs->opaque;
    949    int ret = 0;
    950    Error *local_err = NULL;
    951
    952    if (!s->use_lock) {
    953        return 0;
    954    }
    955
    956    if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
    957        return 0;
    958    }
    959
    960    switch (op) {
    961    case RAW_PL_PREPARE:
    962        if ((s->perm | new_perm) == s->perm &&
    963            (s->shared_perm & new_shared) == s->shared_perm)
    964        {
    965            /*
    966             * We are going to unlock bytes, it should not fail. If it fail due
    967             * to some fs-dependent permission-unrelated reasons (which occurs
    968             * sometimes on NFS and leads to abort in bdrv_replace_child) we
    969             * can't prevent such errors by any check here. And we ignore them
    970             * anyway in ABORT and COMMIT.
    971             */
    972            return 0;
    973        }
    974        ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
    975                                   ~s->shared_perm | ~new_shared,
    976                                   false, errp);
    977        if (!ret) {
    978            ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
    979            if (!ret) {
    980                return 0;
    981            }
    982            error_append_hint(errp,
    983                              "Is another process using the image [%s]?\n",
    984                              bs->filename);
    985        }
    986        /* fall through to unlock bytes. */
    987    case RAW_PL_ABORT:
    988        raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
    989                             true, &local_err);
    990        if (local_err) {
    991            /* Theoretically the above call only unlocks bytes and it cannot
    992             * fail. Something weird happened, report it.
    993             */
    994            warn_report_err(local_err);
    995        }
    996        break;
    997    case RAW_PL_COMMIT:
    998        raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
    999                             true, &local_err);
   1000        if (local_err) {
   1001            /* Theoretically the above call only unlocks bytes and it cannot
   1002             * fail. Something weird happened, report it.
   1003             */
   1004            warn_report_err(local_err);
   1005        }
   1006        break;
   1007    }
   1008    return ret;
   1009}
   1010
   1011static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
   1012                                 int *open_flags, uint64_t perm, bool force_dup,
   1013                                 Error **errp)
   1014{
   1015    BDRVRawState *s = bs->opaque;
   1016    int fd = -1;
   1017    int ret;
   1018    bool has_writers = perm &
   1019        (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
   1020    int fcntl_flags = O_APPEND | O_NONBLOCK;
   1021#ifdef O_NOATIME
   1022    fcntl_flags |= O_NOATIME;
   1023#endif
   1024
   1025    *open_flags = 0;
   1026    if (s->type == FTYPE_CD) {
   1027        *open_flags |= O_NONBLOCK;
   1028    }
   1029
   1030    raw_parse_flags(flags, open_flags, has_writers);
   1031
   1032#ifdef O_ASYNC
   1033    /* Not all operating systems have O_ASYNC, and those that don't
   1034     * will not let us track the state into rs->open_flags (typically
   1035     * you achieve the same effect with an ioctl, for example I_SETSIG
   1036     * on Solaris). But we do not use O_ASYNC, so that's fine.
   1037     */
   1038    assert((s->open_flags & O_ASYNC) == 0);
   1039#endif
   1040
   1041    if (!force_dup && *open_flags == s->open_flags) {
   1042        /* We're lucky, the existing fd is fine */
   1043        return s->fd;
   1044    }
   1045
   1046    if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
   1047        /* dup the original fd */
   1048        fd = qemu_dup(s->fd);
   1049        if (fd >= 0) {
   1050            ret = fcntl_setfl(fd, *open_flags);
   1051            if (ret) {
   1052                qemu_close(fd);
   1053                fd = -1;
   1054            }
   1055        }
   1056    }
   1057
   1058    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
   1059    if (fd == -1) {
   1060        const char *normalized_filename = bs->filename;
   1061        ret = raw_normalize_devicepath(&normalized_filename, errp);
   1062        if (ret >= 0) {
   1063            fd = qemu_open(normalized_filename, *open_flags, errp);
   1064            if (fd == -1) {
   1065                return -1;
   1066            }
   1067        }
   1068    }
   1069
   1070    if (fd != -1 && (*open_flags & O_RDWR)) {
   1071        ret = check_hdev_writable(fd);
   1072        if (ret < 0) {
   1073            qemu_close(fd);
   1074            error_setg_errno(errp, -ret, "The device is not writable");
   1075            return -1;
   1076        }
   1077    }
   1078
   1079    return fd;
   1080}
   1081
   1082static int raw_reopen_prepare(BDRVReopenState *state,
   1083                              BlockReopenQueue *queue, Error **errp)
   1084{
   1085    BDRVRawState *s;
   1086    BDRVRawReopenState *rs;
   1087    QemuOpts *opts;
   1088    int ret;
   1089
   1090    assert(state != NULL);
   1091    assert(state->bs != NULL);
   1092
   1093    s = state->bs->opaque;
   1094
   1095    state->opaque = g_new0(BDRVRawReopenState, 1);
   1096    rs = state->opaque;
   1097
   1098    /* Handle options changes */
   1099    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
   1100    if (!qemu_opts_absorb_qdict(opts, state->options, errp)) {
   1101        ret = -EINVAL;
   1102        goto out;
   1103    }
   1104
   1105    rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
   1106    rs->check_cache_dropped =
   1107        qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
   1108
   1109    /* This driver's reopen function doesn't currently allow changing
   1110     * other options, so let's put them back in the original QDict and
   1111     * bdrv_reopen_prepare() will detect changes and complain. */
   1112    qemu_opts_to_qdict(opts, state->options);
   1113
   1114    /*
   1115     * As part of reopen prepare we also want to create new fd by
   1116     * raw_reconfigure_getfd(). But it wants updated "perm", when in
   1117     * bdrv_reopen_multiple() .bdrv_reopen_prepare() callback called prior to
   1118     * permission update. Happily, permission update is always a part (a seprate
   1119     * stage) of bdrv_reopen_multiple() so we can rely on this fact and
   1120     * reconfigure fd in raw_check_perm().
   1121     */
   1122
   1123    s->reopen_state = state;
   1124    ret = 0;
   1125
   1126out:
   1127    qemu_opts_del(opts);
   1128    return ret;
   1129}
   1130
   1131static void raw_reopen_commit(BDRVReopenState *state)
   1132{
   1133    BDRVRawReopenState *rs = state->opaque;
   1134    BDRVRawState *s = state->bs->opaque;
   1135
   1136    s->drop_cache = rs->drop_cache;
   1137    s->check_cache_dropped = rs->check_cache_dropped;
   1138    s->open_flags = rs->open_flags;
   1139    g_free(state->opaque);
   1140    state->opaque = NULL;
   1141
   1142    assert(s->reopen_state == state);
   1143    s->reopen_state = NULL;
   1144}
   1145
   1146
   1147static void raw_reopen_abort(BDRVReopenState *state)
   1148{
   1149    BDRVRawReopenState *rs = state->opaque;
   1150    BDRVRawState *s = state->bs->opaque;
   1151
   1152     /* nothing to do if NULL, we didn't get far enough */
   1153    if (rs == NULL) {
   1154        return;
   1155    }
   1156
   1157    g_free(state->opaque);
   1158    state->opaque = NULL;
   1159
   1160    assert(s->reopen_state == state);
   1161    s->reopen_state = NULL;
   1162}
   1163
   1164static int hdev_get_max_hw_transfer(int fd, struct stat *st)
   1165{
   1166#ifdef BLKSECTGET
   1167    if (S_ISBLK(st->st_mode)) {
   1168        unsigned short max_sectors = 0;
   1169        if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
   1170            return max_sectors * 512;
   1171        }
   1172    } else {
   1173        int max_bytes = 0;
   1174        if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
   1175            return max_bytes;
   1176        }
   1177    }
   1178    return -errno;
   1179#else
   1180    return -ENOSYS;
   1181#endif
   1182}
   1183
   1184static int hdev_get_max_segments(int fd, struct stat *st)
   1185{
   1186#ifdef CONFIG_LINUX
   1187    char buf[32];
   1188    const char *end;
   1189    char *sysfspath = NULL;
   1190    int ret;
   1191    int sysfd = -1;
   1192    long max_segments;
   1193
   1194    if (S_ISCHR(st->st_mode)) {
   1195        if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
   1196            return ret;
   1197        }
   1198        return -ENOTSUP;
   1199    }
   1200
   1201    if (!S_ISBLK(st->st_mode)) {
   1202        return -ENOTSUP;
   1203    }
   1204
   1205    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
   1206                                major(st->st_rdev), minor(st->st_rdev));
   1207    sysfd = open(sysfspath, O_RDONLY);
   1208    if (sysfd == -1) {
   1209        ret = -errno;
   1210        goto out;
   1211    }
   1212    do {
   1213        ret = read(sysfd, buf, sizeof(buf) - 1);
   1214    } while (ret == -1 && errno == EINTR);
   1215    if (ret < 0) {
   1216        ret = -errno;
   1217        goto out;
   1218    } else if (ret == 0) {
   1219        ret = -EIO;
   1220        goto out;
   1221    }
   1222    buf[ret] = 0;
   1223    /* The file is ended with '\n', pass 'end' to accept that. */
   1224    ret = qemu_strtol(buf, &end, 10, &max_segments);
   1225    if (ret == 0 && end && *end == '\n') {
   1226        ret = max_segments;
   1227    }
   1228
   1229out:
   1230    if (sysfd != -1) {
   1231        close(sysfd);
   1232    }
   1233    g_free(sysfspath);
   1234    return ret;
   1235#else
   1236    return -ENOTSUP;
   1237#endif
   1238}
   1239
   1240static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
   1241{
   1242    BDRVRawState *s = bs->opaque;
   1243    struct stat st;
   1244
   1245    raw_probe_alignment(bs, s->fd, errp);
   1246    bs->bl.min_mem_alignment = s->buf_align;
   1247    bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
   1248
   1249    /*
   1250     * Maximum transfers are best effort, so it is okay to ignore any
   1251     * errors.  That said, based on the man page errors in fstat would be
   1252     * very much unexpected; the only possible case seems to be ENOMEM.
   1253     */
   1254    if (fstat(s->fd, &st)) {
   1255        return;
   1256    }
   1257
   1258#if defined(__APPLE__) && (__MACH__)
   1259    struct statfs buf;
   1260
   1261    if (!fstatfs(s->fd, &buf)) {
   1262        bs->bl.opt_transfer = buf.f_iosize;
   1263        bs->bl.pdiscard_alignment = buf.f_bsize;
   1264    }
   1265#endif
   1266
   1267    if (bs->sg || S_ISBLK(st.st_mode)) {
   1268        int ret = hdev_get_max_hw_transfer(s->fd, &st);
   1269
   1270        if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
   1271            bs->bl.max_hw_transfer = ret;
   1272        }
   1273
   1274        ret = hdev_get_max_segments(s->fd, &st);
   1275        if (ret > 0) {
   1276            bs->bl.max_hw_iov = ret;
   1277        }
   1278    }
   1279}
   1280
   1281static int check_for_dasd(int fd)
   1282{
   1283#ifdef BIODASDINFO2
   1284    struct dasd_information2_t info = {0};
   1285
   1286    return ioctl(fd, BIODASDINFO2, &info);
   1287#else
   1288    return -1;
   1289#endif
   1290}
   1291
   1292/**
   1293 * Try to get @bs's logical and physical block size.
   1294 * On success, store them in @bsz and return zero.
   1295 * On failure, return negative errno.
   1296 */
   1297static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
   1298{
   1299    BDRVRawState *s = bs->opaque;
   1300    int ret;
   1301
   1302    /* If DASD, get blocksizes */
   1303    if (check_for_dasd(s->fd) < 0) {
   1304        return -ENOTSUP;
   1305    }
   1306    ret = probe_logical_blocksize(s->fd, &bsz->log);
   1307    if (ret < 0) {
   1308        return ret;
   1309    }
   1310    return probe_physical_blocksize(s->fd, &bsz->phys);
   1311}
   1312
   1313/**
   1314 * Try to get @bs's geometry: cyls, heads, sectors.
   1315 * On success, store them in @geo and return 0.
   1316 * On failure return -errno.
   1317 * (Allows block driver to assign default geometry values that guest sees)
   1318 */
   1319#ifdef __linux__
   1320static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
   1321{
   1322    BDRVRawState *s = bs->opaque;
   1323    struct hd_geometry ioctl_geo = {0};
   1324
   1325    /* If DASD, get its geometry */
   1326    if (check_for_dasd(s->fd) < 0) {
   1327        return -ENOTSUP;
   1328    }
   1329    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
   1330        return -errno;
   1331    }
   1332    /* HDIO_GETGEO may return success even though geo contains zeros
   1333       (e.g. certain multipath setups) */
   1334    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
   1335        return -ENOTSUP;
   1336    }
   1337    /* Do not return a geometry for partition */
   1338    if (ioctl_geo.start != 0) {
   1339        return -ENOTSUP;
   1340    }
   1341    geo->heads = ioctl_geo.heads;
   1342    geo->sectors = ioctl_geo.sectors;
   1343    geo->cylinders = ioctl_geo.cylinders;
   1344
   1345    return 0;
   1346}
   1347#else /* __linux__ */
   1348static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
   1349{
   1350    return -ENOTSUP;
   1351}
   1352#endif
   1353
   1354#if defined(__linux__)
   1355static int handle_aiocb_ioctl(void *opaque)
   1356{
   1357    RawPosixAIOData *aiocb = opaque;
   1358    int ret;
   1359
   1360    do {
   1361        ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
   1362    } while (ret == -1 && errno == EINTR);
   1363    if (ret == -1) {
   1364        return -errno;
   1365    }
   1366
   1367    return 0;
   1368}
   1369#endif /* linux */
   1370
   1371static int handle_aiocb_flush(void *opaque)
   1372{
   1373    RawPosixAIOData *aiocb = opaque;
   1374    BDRVRawState *s = aiocb->bs->opaque;
   1375    int ret;
   1376
   1377    if (s->page_cache_inconsistent) {
   1378        return -s->page_cache_inconsistent;
   1379    }
   1380
   1381    ret = qemu_fdatasync(aiocb->aio_fildes);
   1382    if (ret == -1) {
   1383        trace_file_flush_fdatasync_failed(errno);
   1384
   1385        /* There is no clear definition of the semantics of a failing fsync(),
   1386         * so we may have to assume the worst. The sad truth is that this
   1387         * assumption is correct for Linux. Some pages are now probably marked
   1388         * clean in the page cache even though they are inconsistent with the
   1389         * on-disk contents. The next fdatasync() call would succeed, but no
   1390         * further writeback attempt will be made. We can't get back to a state
   1391         * in which we know what is on disk (we would have to rewrite
   1392         * everything that was touched since the last fdatasync() at least), so
   1393         * make bdrv_flush() fail permanently. Given that the behaviour isn't
   1394         * really defined, I have little hope that other OSes are doing better.
   1395         *
   1396         * Obviously, this doesn't affect O_DIRECT, which bypasses the page
   1397         * cache. */
   1398        if ((s->open_flags & O_DIRECT) == 0) {
   1399            s->page_cache_inconsistent = errno;
   1400        }
   1401        return -errno;
   1402    }
   1403    return 0;
   1404}
   1405
   1406#ifdef CONFIG_PREADV
   1407
   1408static bool preadv_present = true;
   1409
   1410static ssize_t
   1411qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
   1412{
   1413    return preadv(fd, iov, nr_iov, offset);
   1414}
   1415
   1416static ssize_t
   1417qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
   1418{
   1419    return pwritev(fd, iov, nr_iov, offset);
   1420}
   1421
   1422#else
   1423
   1424static bool preadv_present = false;
   1425
   1426static ssize_t
   1427qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
   1428{
   1429    return -ENOSYS;
   1430}
   1431
   1432static ssize_t
   1433qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
   1434{
   1435    return -ENOSYS;
   1436}
   1437
   1438#endif
   1439
   1440static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
   1441{
   1442    ssize_t len;
   1443
   1444    do {
   1445        if (aiocb->aio_type & QEMU_AIO_WRITE)
   1446            len = qemu_pwritev(aiocb->aio_fildes,
   1447                               aiocb->io.iov,
   1448                               aiocb->io.niov,
   1449                               aiocb->aio_offset);
   1450         else
   1451            len = qemu_preadv(aiocb->aio_fildes,
   1452                              aiocb->io.iov,
   1453                              aiocb->io.niov,
   1454                              aiocb->aio_offset);
   1455    } while (len == -1 && errno == EINTR);
   1456
   1457    if (len == -1) {
   1458        return -errno;
   1459    }
   1460    return len;
   1461}
   1462
   1463/*
   1464 * Read/writes the data to/from a given linear buffer.
   1465 *
   1466 * Returns the number of bytes handles or -errno in case of an error. Short
   1467 * reads are only returned if the end of the file is reached.
   1468 */
   1469static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
   1470{
   1471    ssize_t offset = 0;
   1472    ssize_t len;
   1473
   1474    while (offset < aiocb->aio_nbytes) {
   1475        if (aiocb->aio_type & QEMU_AIO_WRITE) {
   1476            len = pwrite(aiocb->aio_fildes,
   1477                         (const char *)buf + offset,
   1478                         aiocb->aio_nbytes - offset,
   1479                         aiocb->aio_offset + offset);
   1480        } else {
   1481            len = pread(aiocb->aio_fildes,
   1482                        buf + offset,
   1483                        aiocb->aio_nbytes - offset,
   1484                        aiocb->aio_offset + offset);
   1485        }
   1486        if (len == -1 && errno == EINTR) {
   1487            continue;
   1488        } else if (len == -1 && errno == EINVAL &&
   1489                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
   1490                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
   1491                   offset > 0) {
   1492            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
   1493             * after a short read.  Assume that O_DIRECT short reads only occur
   1494             * at EOF.  Therefore this is a short read, not an I/O error.
   1495             */
   1496            break;
   1497        } else if (len == -1) {
   1498            offset = -errno;
   1499            break;
   1500        } else if (len == 0) {
   1501            break;
   1502        }
   1503        offset += len;
   1504    }
   1505
   1506    return offset;
   1507}
   1508
   1509static int handle_aiocb_rw(void *opaque)
   1510{
   1511    RawPosixAIOData *aiocb = opaque;
   1512    ssize_t nbytes;
   1513    char *buf;
   1514
   1515    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
   1516        /*
   1517         * If there is just a single buffer, and it is properly aligned
   1518         * we can just use plain pread/pwrite without any problems.
   1519         */
   1520        if (aiocb->io.niov == 1) {
   1521            nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
   1522            goto out;
   1523        }
   1524        /*
   1525         * We have more than one iovec, and all are properly aligned.
   1526         *
   1527         * Try preadv/pwritev first and fall back to linearizing the
   1528         * buffer if it's not supported.
   1529         */
   1530        if (preadv_present) {
   1531            nbytes = handle_aiocb_rw_vector(aiocb);
   1532            if (nbytes == aiocb->aio_nbytes ||
   1533                (nbytes < 0 && nbytes != -ENOSYS)) {
   1534                goto out;
   1535            }
   1536            preadv_present = false;
   1537        }
   1538
   1539        /*
   1540         * XXX(hch): short read/write.  no easy way to handle the reminder
   1541         * using these interfaces.  For now retry using plain
   1542         * pread/pwrite?
   1543         */
   1544    }
   1545
   1546    /*
   1547     * Ok, we have to do it the hard way, copy all segments into
   1548     * a single aligned buffer.
   1549     */
   1550    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
   1551    if (buf == NULL) {
   1552        nbytes = -ENOMEM;
   1553        goto out;
   1554    }
   1555
   1556    if (aiocb->aio_type & QEMU_AIO_WRITE) {
   1557        char *p = buf;
   1558        int i;
   1559
   1560        for (i = 0; i < aiocb->io.niov; ++i) {
   1561            memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
   1562            p += aiocb->io.iov[i].iov_len;
   1563        }
   1564        assert(p - buf == aiocb->aio_nbytes);
   1565    }
   1566
   1567    nbytes = handle_aiocb_rw_linear(aiocb, buf);
   1568    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
   1569        char *p = buf;
   1570        size_t count = aiocb->aio_nbytes, copy;
   1571        int i;
   1572
   1573        for (i = 0; i < aiocb->io.niov && count; ++i) {
   1574            copy = count;
   1575            if (copy > aiocb->io.iov[i].iov_len) {
   1576                copy = aiocb->io.iov[i].iov_len;
   1577            }
   1578            memcpy(aiocb->io.iov[i].iov_base, p, copy);
   1579            assert(count >= copy);
   1580            p     += copy;
   1581            count -= copy;
   1582        }
   1583        assert(count == 0);
   1584    }
   1585    qemu_vfree(buf);
   1586
   1587out:
   1588    if (nbytes == aiocb->aio_nbytes) {
   1589        return 0;
   1590    } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
   1591        if (aiocb->aio_type & QEMU_AIO_WRITE) {
   1592            return -EINVAL;
   1593        } else {
   1594            iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
   1595                      0, aiocb->aio_nbytes - nbytes);
   1596            return 0;
   1597        }
   1598    } else {
   1599        assert(nbytes < 0);
   1600        return nbytes;
   1601    }
   1602}
   1603
   1604#if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD)
   1605static int translate_err(int err)
   1606{
   1607    if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
   1608        err == -ENOTTY) {
   1609        err = -ENOTSUP;
   1610    }
   1611    return err;
   1612}
   1613#endif
   1614
   1615#ifdef CONFIG_FALLOCATE
   1616static int do_fallocate(int fd, int mode, off_t offset, off_t len)
   1617{
   1618    do {
   1619        if (fallocate(fd, mode, offset, len) == 0) {
   1620            return 0;
   1621        }
   1622    } while (errno == EINTR);
   1623    return translate_err(-errno);
   1624}
   1625#endif
   1626
   1627static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
   1628{
   1629    int ret = -ENOTSUP;
   1630    BDRVRawState *s = aiocb->bs->opaque;
   1631
   1632    if (!s->has_write_zeroes) {
   1633        return -ENOTSUP;
   1634    }
   1635
   1636#ifdef BLKZEROOUT
   1637    /* The BLKZEROOUT implementation in the kernel doesn't set
   1638     * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
   1639     * fallbacks. */
   1640    if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
   1641        do {
   1642            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
   1643            if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
   1644                return 0;
   1645            }
   1646        } while (errno == EINTR);
   1647
   1648        ret = translate_err(-errno);
   1649        if (ret == -ENOTSUP) {
   1650            s->has_write_zeroes = false;
   1651        }
   1652    }
   1653#endif
   1654
   1655    return ret;
   1656}
   1657
   1658static int handle_aiocb_write_zeroes(void *opaque)
   1659{
   1660    RawPosixAIOData *aiocb = opaque;
   1661#ifdef CONFIG_FALLOCATE
   1662    BDRVRawState *s = aiocb->bs->opaque;
   1663    int64_t len;
   1664#endif
   1665
   1666    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
   1667        return handle_aiocb_write_zeroes_block(aiocb);
   1668    }
   1669
   1670#ifdef CONFIG_FALLOCATE_ZERO_RANGE
   1671    if (s->has_write_zeroes) {
   1672        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
   1673                               aiocb->aio_offset, aiocb->aio_nbytes);
   1674        if (ret == -ENOTSUP) {
   1675            s->has_write_zeroes = false;
   1676        } else if (ret == 0 || ret != -EINVAL) {
   1677            return ret;
   1678        }
   1679        /*
   1680         * Note: Some file systems do not like unaligned byte ranges, and
   1681         * return EINVAL in such a case, though they should not do it according
   1682         * to the man-page of fallocate(). Thus we simply ignore this return
   1683         * value and try the other fallbacks instead.
   1684         */
   1685    }
   1686#endif
   1687
   1688#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
   1689    if (s->has_discard && s->has_fallocate) {
   1690        int ret = do_fallocate(s->fd,
   1691                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
   1692                               aiocb->aio_offset, aiocb->aio_nbytes);
   1693        if (ret == 0) {
   1694            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
   1695            if (ret == 0 || ret != -ENOTSUP) {
   1696                return ret;
   1697            }
   1698            s->has_fallocate = false;
   1699        } else if (ret == -EINVAL) {
   1700            /*
   1701             * Some file systems like older versions of GPFS do not like un-
   1702             * aligned byte ranges, and return EINVAL in such a case, though
   1703             * they should not do it according to the man-page of fallocate().
   1704             * Warn about the bad filesystem and try the final fallback instead.
   1705             */
   1706            warn_report_once("Your file system is misbehaving: "
   1707                             "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. "
   1708                             "Please report this bug to your file system "
   1709                             "vendor.");
   1710        } else if (ret != -ENOTSUP) {
   1711            return ret;
   1712        } else {
   1713            s->has_discard = false;
   1714        }
   1715    }
   1716#endif
   1717
   1718#ifdef CONFIG_FALLOCATE
   1719    /* Last resort: we are trying to extend the file with zeroed data. This
   1720     * can be done via fallocate(fd, 0) */
   1721    len = bdrv_getlength(aiocb->bs);
   1722    if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
   1723        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
   1724        if (ret == 0 || ret != -ENOTSUP) {
   1725            return ret;
   1726        }
   1727        s->has_fallocate = false;
   1728    }
   1729#endif
   1730
   1731    return -ENOTSUP;
   1732}
   1733
   1734static int handle_aiocb_write_zeroes_unmap(void *opaque)
   1735{
   1736    RawPosixAIOData *aiocb = opaque;
   1737    BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
   1738
   1739    /* First try to write zeros and unmap at the same time */
   1740
   1741#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
   1742    int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
   1743                           aiocb->aio_offset, aiocb->aio_nbytes);
   1744    switch (ret) {
   1745    case -ENOTSUP:
   1746    case -EINVAL:
   1747    case -EBUSY:
   1748        break;
   1749    default:
   1750        return ret;
   1751    }
   1752#endif
   1753
   1754    /* If we couldn't manage to unmap while guaranteed that the area reads as
   1755     * all-zero afterwards, just write zeroes without unmapping */
   1756    return handle_aiocb_write_zeroes(aiocb);
   1757}
   1758
   1759#ifndef HAVE_COPY_FILE_RANGE
   1760static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
   1761                             off_t *out_off, size_t len, unsigned int flags)
   1762{
   1763#ifdef __NR_copy_file_range
   1764    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
   1765                   out_off, len, flags);
   1766#else
   1767    errno = ENOSYS;
   1768    return -1;
   1769#endif
   1770}
   1771#endif
   1772
   1773static int handle_aiocb_copy_range(void *opaque)
   1774{
   1775    RawPosixAIOData *aiocb = opaque;
   1776    uint64_t bytes = aiocb->aio_nbytes;
   1777    off_t in_off = aiocb->aio_offset;
   1778    off_t out_off = aiocb->copy_range.aio_offset2;
   1779
   1780    while (bytes) {
   1781        ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
   1782                                      aiocb->copy_range.aio_fd2, &out_off,
   1783                                      bytes, 0);
   1784        trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
   1785                                   aiocb->copy_range.aio_fd2, out_off, bytes,
   1786                                   0, ret);
   1787        if (ret == 0) {
   1788            /* No progress (e.g. when beyond EOF), let the caller fall back to
   1789             * buffer I/O. */
   1790            return -ENOSPC;
   1791        }
   1792        if (ret < 0) {
   1793            switch (errno) {
   1794            case ENOSYS:
   1795                return -ENOTSUP;
   1796            case EINTR:
   1797                continue;
   1798            default:
   1799                return -errno;
   1800            }
   1801        }
   1802        bytes -= ret;
   1803    }
   1804    return 0;
   1805}
   1806
   1807static int handle_aiocb_discard(void *opaque)
   1808{
   1809    RawPosixAIOData *aiocb = opaque;
   1810    int ret = -EOPNOTSUPP;
   1811    BDRVRawState *s = aiocb->bs->opaque;
   1812
   1813    if (!s->has_discard) {
   1814        return -ENOTSUP;
   1815    }
   1816
   1817    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
   1818#ifdef BLKDISCARD
   1819        do {
   1820            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
   1821            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
   1822                return 0;
   1823            }
   1824        } while (errno == EINTR);
   1825
   1826        ret = translate_err(-errno);
   1827#endif
   1828    } else {
   1829#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
   1830        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
   1831                           aiocb->aio_offset, aiocb->aio_nbytes);
   1832        ret = translate_err(-errno);
   1833#elif defined(__APPLE__) && (__MACH__)
   1834        fpunchhole_t fpunchhole;
   1835        fpunchhole.fp_flags = 0;
   1836        fpunchhole.reserved = 0;
   1837        fpunchhole.fp_offset = aiocb->aio_offset;
   1838        fpunchhole.fp_length = aiocb->aio_nbytes;
   1839        if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
   1840            ret = errno == ENODEV ? -ENOTSUP : -errno;
   1841        } else {
   1842            ret = 0;
   1843        }
   1844#endif
   1845    }
   1846
   1847    if (ret == -ENOTSUP) {
   1848        s->has_discard = false;
   1849    }
   1850    return ret;
   1851}
   1852
   1853/*
   1854 * Help alignment probing by allocating the first block.
   1855 *
   1856 * When reading with direct I/O from unallocated area on Gluster backed by XFS,
   1857 * reading succeeds regardless of request length. In this case we fallback to
   1858 * safe alignment which is not optimal. Allocating the first block avoids this
   1859 * fallback.
   1860 *
   1861 * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
   1862 * request alignment, so we use safe values.
   1863 *
   1864 * Returns: 0 on success, -errno on failure. Since this is an optimization,
   1865 * caller may ignore failures.
   1866 */
   1867static int allocate_first_block(int fd, size_t max_size)
   1868{
   1869    size_t write_size = (max_size < MAX_BLOCKSIZE)
   1870        ? BDRV_SECTOR_SIZE
   1871        : MAX_BLOCKSIZE;
   1872    size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
   1873    void *buf;
   1874    ssize_t n;
   1875    int ret;
   1876
   1877    buf = qemu_memalign(max_align, write_size);
   1878    memset(buf, 0, write_size);
   1879
   1880    do {
   1881        n = pwrite(fd, buf, write_size, 0);
   1882    } while (n == -1 && errno == EINTR);
   1883
   1884    ret = (n == -1) ? -errno : 0;
   1885
   1886    qemu_vfree(buf);
   1887    return ret;
   1888}
   1889
/*
 * Thread-pool worker that resizes the file aio_fildes to aio_offset bytes
 * and fills newly added space according to truncate.prealloc.
 *
 * Shrinking is only accepted with PREALLOC_MODE_OFF. For the FALLOC and FULL
 * modes a failure leads to an attempt to restore the previous file length
 * before returning.
 *
 * Errors are reported through truncate.errp.
 *
 * Returns: 0 on success, a negative errno value on failure.
 */
static int handle_aiocb_truncate(void *opaque)
{
    RawPosixAIOData *aiocb = opaque;
    int result = 0;
    int64_t current_length = 0;
    char *buf = NULL;
    struct stat st;
    int fd = aiocb->aio_fildes;
    int64_t offset = aiocb->aio_offset;
    PreallocMode prealloc = aiocb->truncate.prealloc;
    Error **errp = aiocb->truncate.errp;

    if (fstat(fd, &st) < 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not stat file");
        return result;
    }

    current_length = st.st_size;
    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Cannot use preallocation for shrinking files");
        return -ENOTSUP;
    }

    switch (prealloc) {
#ifdef CONFIG_POSIX_FALLOCATE
    case PREALLOC_MODE_FALLOC:
        /*
         * Truncating before posix_fallocate() makes it about twice slower on
         * file systems that do not support fallocate(), trying to check if a
         * block is allocated before allocating it, so don't do that here.
         */
        if (offset != current_length) {
            /* posix_fallocate() returns a positive error number; negate it */
            result = -posix_fallocate(fd, current_length,
                                      offset - current_length);
            if (result != 0) {
                /* posix_fallocate() doesn't set errno. */
                error_setg_errno(errp, -result,
                                 "Could not preallocate new data");
            } else if (current_length == 0) {
                /*
                 * posix_fallocate() uses fallocate() if the filesystem
                 * supports it, or fallback to manually writing zeroes. If
                 * fallocate() was used, unaligned reads from the fallocated
                 * area in raw_probe_alignment() will succeed, hence we need to
                 * allocate the first block.
                 *
                 * Optimize future alignment probing; ignore failures.
                 */
                allocate_first_block(fd, offset);
            }
        } else {
            result = 0;
        }
        goto out;
#endif
    case PREALLOC_MODE_FULL:
    {
        /* left: number of bytes that still have to be written as zeroes */
        int64_t num = 0, left = offset - current_length;
        off_t seek_result;

        /*
         * Knowing the final size from the beginning could allow the file
         * system driver to do less allocations and possibly avoid
         * fragmentation of the file.
         */
        if (ftruncate(fd, offset) != 0) {
            result = -errno;
            error_setg_errno(errp, -result, "Could not resize file");
            goto out;
        }

        /* 64 KiB zero-filled buffer, written repeatedly below */
        buf = g_malloc0(65536);

        seek_result = lseek(fd, current_length, SEEK_SET);
        if (seek_result < 0) {
            result = -errno;
            error_setg_errno(errp, -result,
                             "Failed to seek to the old end of file");
            goto out;
        }

        while (left > 0) {
            num = MIN(left, 65536);
            /* On success result holds the byte count actually written */
            result = write(fd, buf, num);
            if (result < 0) {
                if (errno == EINTR) {
                    continue;
                }
                result = -errno;
                error_setg_errno(errp, -result,
                                 "Could not write zeros for preallocation");
                goto out;
            }
            left -= result;
        }
        /*
         * result is the last write()'s byte count here, or still 0 when
         * there was nothing to write; fsync() replaces it with 0 or -errno.
         */
        if (result >= 0) {
            result = fsync(fd);
            if (result < 0) {
                result = -errno;
                error_setg_errno(errp, -result,
                                 "Could not flush file to disk");
                goto out;
            }
        }
        goto out;
    }
    case PREALLOC_MODE_OFF:
        if (ftruncate(fd, offset) != 0) {
            result = -errno;
            error_setg_errno(errp, -result, "Could not resize file");
        } else if (current_length == 0 && offset > current_length) {
            /* Optimize future alignment probing; ignore failures. */
            allocate_first_block(fd, offset);
        }
        /* Returns directly: no rollback and no buffer to free on this path */
        return result;
    default:
        result = -ENOTSUP;
        error_setg(errp, "Unsupported preallocation mode: %s",
                   PreallocMode_str(prealloc));
        return result;
    }

out:
    /* Preallocation failed: try to put the file back to its old length */
    if (result < 0) {
        if (ftruncate(fd, current_length) < 0) {
            error_report("Failed to restore old file length: %s",
                         strerror(errno));
        }
    }

    g_free(buf);
    return result;
}
   2024
   2025static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
   2026                                               ThreadPoolFunc func, void *arg)
   2027{
   2028    /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
   2029    ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
   2030    return thread_pool_submit_co(pool, func, arg);
   2031}
   2032
   2033static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
   2034                                   uint64_t bytes, QEMUIOVector *qiov, int type)
   2035{
   2036    BDRVRawState *s = bs->opaque;
   2037    RawPosixAIOData acb;
   2038
   2039    if (fd_open(bs) < 0)
   2040        return -EIO;
   2041
   2042    /*
   2043     * When using O_DIRECT, the request must be aligned to be able to use
   2044     * either libaio or io_uring interface. If not fail back to regular thread
   2045     * pool read/write code which emulates this for us if we
   2046     * set QEMU_AIO_MISALIGNED.
   2047     */
   2048    if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
   2049        type |= QEMU_AIO_MISALIGNED;
   2050#ifdef CONFIG_LINUX_IO_URING
   2051    } else if (s->use_linux_io_uring) {
   2052        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
   2053        assert(qiov->size == bytes);
   2054        return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
   2055#endif
   2056#ifdef CONFIG_LINUX_AIO
   2057    } else if (s->use_linux_aio) {
   2058        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
   2059        assert(qiov->size == bytes);
   2060        return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
   2061#endif
   2062    }
   2063
   2064    acb = (RawPosixAIOData) {
   2065        .bs             = bs,
   2066        .aio_fildes     = s->fd,
   2067        .aio_type       = type,
   2068        .aio_offset     = offset,
   2069        .aio_nbytes     = bytes,
   2070        .io             = {
   2071            .iov            = qiov->iov,
   2072            .niov           = qiov->niov,
   2073        },
   2074    };
   2075
   2076    assert(qiov->size == bytes);
   2077    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
   2078}
   2079
   2080static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
   2081                                      int64_t bytes, QEMUIOVector *qiov,
   2082                                      BdrvRequestFlags flags)
   2083{
   2084    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
   2085}
   2086
   2087static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
   2088                                       int64_t bytes, QEMUIOVector *qiov,
   2089                                       BdrvRequestFlags flags)
   2090{
   2091    assert(flags == 0);
   2092    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
   2093}
   2094
   2095static void raw_aio_plug(BlockDriverState *bs)
   2096{
   2097    BDRVRawState __attribute__((unused)) *s = bs->opaque;
   2098#ifdef CONFIG_LINUX_AIO
   2099    if (s->use_linux_aio) {
   2100        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
   2101        laio_io_plug(bs, aio);
   2102    }
   2103#endif
   2104#ifdef CONFIG_LINUX_IO_URING
   2105    if (s->use_linux_io_uring) {
   2106        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
   2107        luring_io_plug(bs, aio);
   2108    }
   2109#endif
   2110}
   2111
   2112static void raw_aio_unplug(BlockDriverState *bs)
   2113{
   2114    BDRVRawState __attribute__((unused)) *s = bs->opaque;
   2115#ifdef CONFIG_LINUX_AIO
   2116    if (s->use_linux_aio) {
   2117        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
   2118        laio_io_unplug(bs, aio);
   2119    }
   2120#endif
   2121#ifdef CONFIG_LINUX_IO_URING
   2122    if (s->use_linux_io_uring) {
   2123        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
   2124        luring_io_unplug(bs, aio);
   2125    }
   2126#endif
   2127}
   2128
   2129static int raw_co_flush_to_disk(BlockDriverState *bs)
   2130{
   2131    BDRVRawState *s = bs->opaque;
   2132    RawPosixAIOData acb;
   2133    int ret;
   2134
   2135    ret = fd_open(bs);
   2136    if (ret < 0) {
   2137        return ret;
   2138    }
   2139
   2140    acb = (RawPosixAIOData) {
   2141        .bs             = bs,
   2142        .aio_fildes     = s->fd,
   2143        .aio_type       = QEMU_AIO_FLUSH,
   2144    };
   2145
   2146#ifdef CONFIG_LINUX_IO_URING
   2147    if (s->use_linux_io_uring) {
   2148        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
   2149        return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
   2150    }
   2151#endif
   2152    return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
   2153}
   2154
   2155static void raw_aio_attach_aio_context(BlockDriverState *bs,
   2156                                       AioContext *new_context)
   2157{
   2158    BDRVRawState __attribute__((unused)) *s = bs->opaque;
   2159#ifdef CONFIG_LINUX_AIO
   2160    if (s->use_linux_aio) {
   2161        Error *local_err = NULL;
   2162        if (!aio_setup_linux_aio(new_context, &local_err)) {
   2163            error_reportf_err(local_err, "Unable to use native AIO, "
   2164                                         "falling back to thread pool: ");
   2165            s->use_linux_aio = false;
   2166        }
   2167    }
   2168#endif
   2169#ifdef CONFIG_LINUX_IO_URING
   2170    if (s->use_linux_io_uring) {
   2171        Error *local_err = NULL;
   2172        if (!aio_setup_linux_io_uring(new_context, &local_err)) {
   2173            error_reportf_err(local_err, "Unable to use linux io_uring, "
   2174                                         "falling back to thread pool: ");
   2175            s->use_linux_io_uring = false;
   2176        }
   2177    }
   2178#endif
   2179}
   2180
   2181static void raw_close(BlockDriverState *bs)
   2182{
   2183    BDRVRawState *s = bs->opaque;
   2184
   2185    if (s->fd >= 0) {
   2186        qemu_close(s->fd);
   2187        s->fd = -1;
   2188    }
   2189}
   2190
   2191/**
   2192 * Truncates the given regular file @fd to @offset and, when growing, fills the
   2193 * new space according to @prealloc.
   2194 *
   2195 * Returns: 0 on success, -errno on failure.
   2196 */
   2197static int coroutine_fn
   2198raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
   2199                     PreallocMode prealloc, Error **errp)
   2200{
   2201    RawPosixAIOData acb;
   2202
   2203    acb = (RawPosixAIOData) {
   2204        .bs             = bs,
   2205        .aio_fildes     = fd,
   2206        .aio_type       = QEMU_AIO_TRUNCATE,
   2207        .aio_offset     = offset,
   2208        .truncate       = {
   2209            .prealloc       = prealloc,
   2210            .errp           = errp,
   2211        },
   2212    };
   2213
   2214    return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb);
   2215}
   2216
   2217static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
   2218                                        bool exact, PreallocMode prealloc,
   2219                                        BdrvRequestFlags flags, Error **errp)
   2220{
   2221    BDRVRawState *s = bs->opaque;
   2222    struct stat st;
   2223    int ret;
   2224
   2225    if (fstat(s->fd, &st)) {
   2226        ret = -errno;
   2227        error_setg_errno(errp, -ret, "Failed to fstat() the file");
   2228        return ret;
   2229    }
   2230
   2231    if (S_ISREG(st.st_mode)) {
   2232        /* Always resizes to the exact @offset */
   2233        return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
   2234    }
   2235
   2236    if (prealloc != PREALLOC_MODE_OFF) {
   2237        error_setg(errp, "Preallocation mode '%s' unsupported for this "
   2238                   "non-regular file", PreallocMode_str(prealloc));
   2239        return -ENOTSUP;
   2240    }
   2241
   2242    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
   2243        int64_t cur_length = raw_getlength(bs);
   2244
   2245        if (offset != cur_length && exact) {
   2246            error_setg(errp, "Cannot resize device files");
   2247            return -ENOTSUP;
   2248        } else if (offset > cur_length) {
   2249            error_setg(errp, "Cannot grow device files");
   2250            return -EINVAL;
   2251        }
   2252    } else {
   2253        error_setg(errp, "Resizing this file is not supported");
   2254        return -ENOTSUP;
   2255    }
   2256
   2257    return 0;
   2258}
   2259
   2260#ifdef __OpenBSD__
   2261static int64_t raw_getlength(BlockDriverState *bs)
   2262{
   2263    BDRVRawState *s = bs->opaque;
   2264    int fd = s->fd;
   2265    struct stat st;
   2266
   2267    if (fstat(fd, &st))
   2268        return -errno;
   2269    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
   2270        struct disklabel dl;
   2271
   2272        if (ioctl(fd, DIOCGDINFO, &dl))
   2273            return -errno;
   2274        return (uint64_t)dl.d_secsize *
   2275            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
   2276    } else
   2277        return st.st_size;
   2278}
   2279#elif defined(__NetBSD__)
   2280static int64_t raw_getlength(BlockDriverState *bs)
   2281{
   2282    BDRVRawState *s = bs->opaque;
   2283    int fd = s->fd;
   2284    struct stat st;
   2285
   2286    if (fstat(fd, &st))
   2287        return -errno;
   2288    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
   2289        struct dkwedge_info dkw;
   2290
   2291        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
   2292            return dkw.dkw_size * 512;
   2293        } else {
   2294            struct disklabel dl;
   2295
   2296            if (ioctl(fd, DIOCGDINFO, &dl))
   2297                return -errno;
   2298            return (uint64_t)dl.d_secsize *
   2299                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
   2300        }
   2301    } else
   2302        return st.st_size;
   2303}
   2304#elif defined(__sun__)
   2305static int64_t raw_getlength(BlockDriverState *bs)
   2306{
   2307    BDRVRawState *s = bs->opaque;
   2308    struct dk_minfo minfo;
   2309    int ret;
   2310    int64_t size;
   2311
   2312    ret = fd_open(bs);
   2313    if (ret < 0) {
   2314        return ret;
   2315    }
   2316
   2317    /*
   2318     * Use the DKIOCGMEDIAINFO ioctl to read the size.
   2319     */
   2320    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
   2321    if (ret != -1) {
   2322        return minfo.dki_lbsize * minfo.dki_capacity;
   2323    }
   2324
   2325    /*
   2326     * There are reports that lseek on some devices fails, but
   2327     * irc discussion said that contingency on contingency was overkill.
   2328     */
   2329    size = lseek(s->fd, 0, SEEK_END);
   2330    if (size < 0) {
   2331        return -errno;
   2332    }
   2333    return size;
   2334}
   2335#elif defined(CONFIG_BSD)
/*
 * Generic BSD (CONFIG_BSD): return the size in bytes of the file behind @bs.
 *
 * When fstat() succeeds and the S_IFCHR bit test matches, the size is
 * queried through whichever of the DIOCGMEDIASIZE, DIOCGPART and
 * DKIOCGETBLOCKCOUNT/DKIOCGETBLOCKSIZE ioctls exist at build time, each one
 * tried only while the size is still 0, with lseek(SEEK_END) as the last
 * resort. All other files use lseek(SEEK_END) directly.
 *
 * On FreeBSD, a CD drive that reports no usable size is reopened once and
 * the query is repeated.
 *
 * Returns: the size, the negative value from fd_open() if that fails, or
 * -errno when the size cannot be determined.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
    int64_t size;
    struct stat sb;
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
    /* Set once cdrom_reopen() has been tried, so the retry happens once */
    int reopened = 0;
#endif
    int ret;

    ret = fd_open(bs);
    if (ret < 0)
        return ret;

#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
again:
#endif
    /*
     * NOTE(review): this is a bit test against S_IFCHR rather than
     * S_ISCHR(); confirm that matching other file types sharing the bit is
     * intended.
     */
    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
        size = 0;
#ifdef DIOCGMEDIASIZE
        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) {
            /* ioctl failed: reset so the next method is tried */
            size = 0;
        }
#endif
#ifdef DIOCGPART
        if (size == 0) {
            struct partinfo pi;
            if (ioctl(fd, DIOCGPART, &pi) == 0) {
                size = pi.media_size;
            }
        }
#endif
#if defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE)
        if (size == 0) {
            uint64_t sectors = 0;
            uint32_t sector_size = 0;

            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
                size = sectors * sector_size;
            }
        }
#endif
        /* None of the ioctls produced a size: seek to the end instead */
        if (size == 0) {
            size = lseek(fd, 0LL, SEEK_END);
        }
        if (size < 0) {
            return -errno;
        }
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        switch(s->type) {
        case FTYPE_CD:
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
            if (size == 2048LL * (unsigned)-1)
                size = 0;
            /* XXX no disc?  maybe we need to reopen... */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                reopened = 1;
                goto again;
            }
        }
#endif
    } else {
        size = lseek(fd, 0, SEEK_END);
        if (size < 0) {
            return -errno;
        }
    }
    return size;
}
   2407#else
   2408static int64_t raw_getlength(BlockDriverState *bs)
   2409{
   2410    BDRVRawState *s = bs->opaque;
   2411    int ret;
   2412    int64_t size;
   2413
   2414    ret = fd_open(bs);
   2415    if (ret < 0) {
   2416        return ret;
   2417    }
   2418
   2419    size = lseek(s->fd, 0, SEEK_END);
   2420    if (size < 0) {
   2421        return -errno;
   2422    }
   2423    return size;
   2424}
   2425#endif
   2426
   2427static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
   2428{
   2429    struct stat st;
   2430    BDRVRawState *s = bs->opaque;
   2431
   2432    if (fstat(s->fd, &st) < 0) {
   2433        return -errno;
   2434    }
   2435    return (int64_t)st.st_blocks * 512;
   2436}
   2437
/*
 * Create the image file described by @options (driver must be
 * BLOCKDEV_DRIVER_FILE) and bring it to the requested size.
 *
 * Steps: fill in option defaults, create the file, take and check the
 * permission byte locks, truncate the file to 0, optionally set the NOCOW
 * flag and the extent size hint (both best effort), then resize to
 * file_opts->size with the requested preallocation mode.
 *
 * Returns: 0 on success, a negative errno value on failure (with @errp set).
 */
static int coroutine_fn
raw_co_create(BlockdevCreateOptions *options, Error **errp)
{
    BlockdevCreateOptionsFile *file_opts;
    Error *local_err = NULL;
    int fd;
    uint64_t perm, shared;
    int result = 0;

    /* Validate options and set default values */
    assert(options->driver == BLOCKDEV_DRIVER_FILE);
    file_opts = &options->u.file;

    if (!file_opts->has_nocow) {
        file_opts->nocow = false;
    }
    if (!file_opts->has_preallocation) {
        file_opts->preallocation = PREALLOC_MODE_OFF;
    }
    if (!file_opts->has_extent_size_hint) {
        file_opts->extent_size_hint = 1 * MiB;
    }
    if (file_opts->extent_size_hint > UINT32_MAX) {
        result = -EINVAL;
        error_setg(errp, "Extent size hint is too large");
        goto out;
    }

    /* Create file */
    fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
    if (fd < 0) {
        /* NOTE(review): relies on qemu_create() leaving errno intact */
        result = -errno;
        goto out;
    }

    /* Take permissions: We want to discard everything, so we need
     * BLK_PERM_WRITE; and truncation to the desired size requires
     * BLK_PERM_RESIZE.
     * On the other hand, we cannot share the RESIZE permission
     * because we promise that after this function, the file has the
     * size given in the options.  If someone else were to resize it
     * concurrently, we could not guarantee that.
     * Note that after this function, we can no longer guarantee that
     * the file is not touched by a third party, so it may be resized
     * then. */
    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;

    /* Step one: Take locks */
    result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
    if (result < 0) {
        goto out_close;
    }

    /* Step two: Check that nobody else has taken conflicting locks */
    result = raw_check_lock_bytes(fd, perm, shared, errp);
    if (result < 0) {
        error_append_hint(errp,
                          "Is another process using the image [%s]?\n",
                          file_opts->filename);
        goto out_unlock;
    }

    /* Clear the file by truncating it to 0 */
    result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
    if (result < 0) {
        goto out_unlock;
    }

    if (file_opts->nocow) {
#ifdef __linux__
        /* Set NOCOW flag to solve performance issue on fs like btrfs.
         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
         * will be ignored since any failure of this operation should not
         * block the left work.
         */
        int attr;
        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
            attr |= FS_NOCOW_FL;
            ioctl(fd, FS_IOC_SETFLAGS, &attr);
        }
#endif
    }
#ifdef FS_IOC_FSSETXATTR
    /*
     * Try to set the extent size hint. Failure is not fatal, and a warning is
     * only printed if the option was explicitly specified.
     *
     * result is only scratch space here: it is overwritten by the final
     * raw_regular_truncate() call below.
     */
    {
        struct fsxattr attr;
        result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
        if (result == 0) {
            attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
            attr.fsx_extsize = file_opts->extent_size_hint;
            result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
        }
        if (result < 0 && file_opts->has_extent_size_hint &&
            file_opts->extent_size_hint)
        {
            warn_report("Failed to set extent size hint: %s",
                        strerror(errno));
        }
    }
#endif

    /* Resize and potentially preallocate the file to the desired
     * final size */
    result = raw_regular_truncate(NULL, fd, file_opts->size,
                                  file_opts->preallocation, errp);
    if (result < 0) {
        goto out_unlock;
    }

out_unlock:
    /*
     * Reached on success as well as on failure. Presumably this drops the
     * byte locks taken above (permission masks both 0) -- confirm against
     * raw_apply_lock_bytes().
     */
    raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
    if (local_err) {
        /* The above call should not fail, and if it does, that does
         * not mean the whole creation operation has failed.  So
         * report it the user for their convenience, but do not report
         * it to the caller. */
        warn_report_err(local_err);
    }

out_close:
    /* A close failure only becomes the result if nothing failed earlier */
    if (qemu_close(fd) != 0 && result == 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not close the new file");
    }
out:
    return result;
}
   2569
   2570static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
   2571                                           const char *filename,
   2572                                           QemuOpts *opts,
   2573                                           Error **errp)
   2574{
   2575    BlockdevCreateOptions options;
   2576    int64_t total_size = 0;
   2577    int64_t extent_size_hint = 0;
   2578    bool has_extent_size_hint = false;
   2579    bool nocow = false;
   2580    PreallocMode prealloc;
   2581    char *buf = NULL;
   2582    Error *local_err = NULL;
   2583
   2584    /* Skip file: protocol prefix */
   2585    strstart(filename, "file:", &filename);
   2586
   2587    /* Read out options */
   2588    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
   2589                          BDRV_SECTOR_SIZE);
   2590    if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
   2591        has_extent_size_hint = true;
   2592        extent_size_hint =
   2593            qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
   2594    }
   2595    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
   2596    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
   2597    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
   2598                               PREALLOC_MODE_OFF, &local_err);
   2599    g_free(buf);
   2600    if (local_err) {
   2601        error_propagate(errp, local_err);
   2602        return -EINVAL;
   2603    }
   2604
   2605    options = (BlockdevCreateOptions) {
   2606        .driver     = BLOCKDEV_DRIVER_FILE,
   2607        .u.file     = {
   2608            .filename           = (char *) filename,
   2609            .size               = total_size,
   2610            .has_preallocation  = true,
   2611            .preallocation      = prealloc,
   2612            .has_nocow          = true,
   2613            .nocow              = nocow,
   2614            .has_extent_size_hint = has_extent_size_hint,
   2615            .extent_size_hint   = extent_size_hint,
   2616        },
   2617    };
   2618    return raw_co_create(&options, errp);
   2619}
   2620
   2621static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
   2622                                           Error **errp)
   2623{
   2624    struct stat st;
   2625    int ret;
   2626
   2627    if (!(stat(bs->filename, &st) == 0) || !S_ISREG(st.st_mode)) {
   2628        error_setg_errno(errp, ENOENT, "%s is not a regular file",
   2629                         bs->filename);
   2630        return -ENOENT;
   2631    }
   2632
   2633    ret = unlink(bs->filename);
   2634    if (ret < 0) {
   2635        ret = -errno;
   2636        error_setg_errno(errp, -ret, "Error when deleting file %s",
   2637                         bs->filename);
   2638    }
   2639
   2640    return ret;
   2641}
   2642
/*
 * Find allocation range in @bs around offset @start.
 * May change underlying file descriptor's file offset.
 * If @start is not in a hole, store @start in @data, and the
 * beginning of the next hole in @hole, and return 0.
 * If @start is in a non-trailing hole, store @start in @hole and the
 * beginning of the next non-hole in @data, and return 0.
 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
 * If we can't find out, return a negative errno other than -ENXIO.
 * In particular:
 *   -EIO     lseek() returned an offset below @start
 *   -EBUSY   SEEK_DATA said "data" but SEEK_HOLE then said "hole" at @start
 *   -ENOTSUP SEEK_HOLE/SEEK_DATA are not available at build time
 * @data and @hole are only written when 0 is returned.
 */
static int find_allocation(BlockDriverState *bs, off_t start,
                           off_t *data, off_t *hole)
{
#if defined SEEK_HOLE && defined SEEK_DATA
    BDRVRawState *s = bs->opaque;
    off_t offs;

    /*
     * SEEK_DATA cases:
     * D1. offs == start: start is in data
     * D2. offs > start: start is in a hole, next data at offs
     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
     *                              or start is beyond EOF
     *     If the latter happens, the file has been truncated behind
     *     our back since we opened it.  All bets are off then.
     *     Treating like a trailing hole is simplest.
     * D4. offs < 0, errno != ENXIO: we learned nothing
     */
    offs = lseek(s->fd, start, SEEK_DATA);
    if (offs < 0) {
        return -errno;          /* D3 or D4 */
    }

    if (offs < start) {
        /* This is not a valid return by lseek().  We are safe to just return
         * -EIO in this case, and we'll treat it like D4. */
        return -EIO;
    }

    if (offs > start) {
        /* D2: in hole, next data at offs */
        *hole = start;
        *data = offs;
        return 0;
    }

    /* D1: in data, end not yet known */

    /*
     * SEEK_HOLE cases:
     * H1. offs == start: start is in a hole
     *     If this happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H2. offs > start: either start is in data, next hole at offs,
     *                   or start is in trailing hole, EOF at offs
     *     Linux treats trailing holes like any other hole: offs ==
     *     start.  Solaris seeks to EOF instead: offs > start (blech).
     *     If that happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H3. offs < 0, errno = ENXIO: start is beyond EOF
     *     If this happens, the file has been truncated behind our
     *     back since we opened it.  Treat it like a trailing hole.
     * H4. offs < 0, errno != ENXIO: we learned nothing
     *     Pretend we know nothing at all, i.e. "forget" about D1.
     */
    offs = lseek(s->fd, start, SEEK_HOLE);
    if (offs < 0) {
        return -errno;          /* D1 and (H3 or H4) */
    }

    if (offs < start) {
        /* This is not a valid return by lseek().  We are safe to just return
         * -EIO in this case, and we'll treat it like H4. */
        return -EIO;
    }

    if (offs > start) {
        /*
         * D1 and H2: either in data, next hole at offs, or it was in
         * data but is now in a trailing hole.  In the latter case,
         * all bets are off.  Treating it as if there was data all
         * the way to EOF is safe, so simply do that.
         */
        *data = start;
        *hole = offs;
        return 0;
    }

    /*
     * D1 and H1: the two probes contradict each other, so the file must
     * have changed between them.  Report that we could not find out.
     */
    return -EBUSY;
#else
    /* No SEEK_HOLE/SEEK_DATA on this host: allocation cannot be queried */
    return -ENOTSUP;
#endif
}
   2737
   2738/*
   2739 * Returns the allocation status of the specified offset.
   2740 *
   2741 * The block layer guarantees 'offset' and 'bytes' are within bounds.
   2742 *
   2743 * 'pnum' is set to the number of bytes (including and immediately following
   2744 * the specified offset) that are known to be in the same
   2745 * allocated/unallocated state.
   2746 *
   2747 * 'bytes' is a soft cap for 'pnum'.  If the information is free, 'pnum' may
   2748 * well exceed it.
   2749 */
   2750static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
   2751                                            bool want_zero,
   2752                                            int64_t offset,
   2753                                            int64_t bytes, int64_t *pnum,
   2754                                            int64_t *map,
   2755                                            BlockDriverState **file)
   2756{
   2757    off_t data = 0, hole = 0;
   2758    int ret;
   2759
   2760    assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
   2761
   2762    ret = fd_open(bs);
   2763    if (ret < 0) {
   2764        return ret;
   2765    }
   2766
   2767    if (!want_zero) {
   2768        *pnum = bytes;
   2769        *map = offset;
   2770        *file = bs;
   2771        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
   2772    }
   2773
   2774    ret = find_allocation(bs, offset, &data, &hole);
   2775    if (ret == -ENXIO) {
   2776        /* Trailing hole */
   2777        *pnum = bytes;
   2778        ret = BDRV_BLOCK_ZERO;
   2779    } else if (ret < 0) {
   2780        /* No info available, so pretend there are no holes */
   2781        *pnum = bytes;
   2782        ret = BDRV_BLOCK_DATA;
   2783    } else if (data == offset) {
   2784        /* On a data extent, compute bytes to the end of the extent,
   2785         * possibly including a partial sector at EOF. */
   2786        *pnum = hole - offset;
   2787
   2788        /*
   2789         * We are not allowed to return partial sectors, though, so
   2790         * round up if necessary.
   2791         */
   2792        if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
   2793            int64_t file_length = raw_getlength(bs);
   2794            if (file_length > 0) {
   2795                /* Ignore errors, this is just a safeguard */
   2796                assert(hole == file_length);
   2797            }
   2798            *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
   2799        }
   2800
   2801        ret = BDRV_BLOCK_DATA;
   2802    } else {
   2803        /* On a hole, compute bytes to the beginning of the next extent.  */
   2804        assert(hole == offset);
   2805        *pnum = data - offset;
   2806        ret = BDRV_BLOCK_ZERO;
   2807    }
   2808    *map = offset;
   2809    *file = bs;
   2810    return ret | BDRV_BLOCK_OFFSET_VALID;
   2811}
   2812
   2813#if defined(__linux__)
   2814/* Verify that the file is not in the page cache */
   2815static void check_cache_dropped(BlockDriverState *bs, Error **errp)
   2816{
   2817    const size_t window_size = 128 * 1024 * 1024;
   2818    BDRVRawState *s = bs->opaque;
   2819    void *window = NULL;
   2820    size_t length = 0;
   2821    unsigned char *vec;
   2822    size_t page_size;
   2823    off_t offset;
   2824    off_t end;
   2825
   2826    /* mincore(2) page status information requires 1 byte per page */
   2827    page_size = sysconf(_SC_PAGESIZE);
   2828    vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
   2829
   2830    end = raw_getlength(bs);
   2831
   2832    for (offset = 0; offset < end; offset += window_size) {
   2833        void *new_window;
   2834        size_t new_length;
   2835        size_t vec_end;
   2836        size_t i;
   2837        int ret;
   2838
   2839        /* Unmap previous window if size has changed */
   2840        new_length = MIN(end - offset, window_size);
   2841        if (new_length != length) {
   2842            munmap(window, length);
   2843            window = NULL;
   2844            length = 0;
   2845        }
   2846
   2847        new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
   2848                          s->fd, offset);
   2849        if (new_window == MAP_FAILED) {
   2850            error_setg_errno(errp, errno, "mmap failed");
   2851            break;
   2852        }
   2853
   2854        window = new_window;
   2855        length = new_length;
   2856
   2857        ret = mincore(window, length, vec);
   2858        if (ret < 0) {
   2859            error_setg_errno(errp, errno, "mincore failed");
   2860            break;
   2861        }
   2862
   2863        vec_end = DIV_ROUND_UP(length, page_size);
   2864        for (i = 0; i < vec_end; i++) {
   2865            if (vec[i] & 0x1) {
   2866                break;
   2867            }
   2868        }
   2869        if (i < vec_end) {
   2870            error_setg(errp, "page cache still in use!");
   2871            break;
   2872        }
   2873    }
   2874
   2875    if (window) {
   2876        munmap(window, length);
   2877    }
   2878
   2879    g_free(vec);
   2880}
   2881#endif /* __linux__ */
   2882
   2883static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
   2884                                                 Error **errp)
   2885{
   2886    BDRVRawState *s = bs->opaque;
   2887    int ret;
   2888
   2889    ret = fd_open(bs);
   2890    if (ret < 0) {
   2891        error_setg_errno(errp, -ret, "The file descriptor is not open");
   2892        return;
   2893    }
   2894
   2895    if (!s->drop_cache) {
   2896        return;
   2897    }
   2898
   2899    if (s->open_flags & O_DIRECT) {
   2900        return; /* No host kernel page cache */
   2901    }
   2902
   2903#if defined(__linux__)
   2904    /* This sets the scene for the next syscall... */
   2905    ret = bdrv_co_flush(bs);
   2906    if (ret < 0) {
   2907        error_setg_errno(errp, -ret, "flush failed");
   2908        return;
   2909    }
   2910
   2911    /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
   2912     * process.  These limitations are okay because we just fsynced the file,
   2913     * we don't use mmap, and the file should not be in use by other processes.
   2914     */
   2915    ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
   2916    if (ret != 0) { /* the return value is a positive errno */
   2917        error_setg_errno(errp, ret, "fadvise failed");
   2918        return;
   2919    }
   2920
   2921    if (s->check_cache_dropped) {
   2922        check_cache_dropped(bs, errp);
   2923    }
   2924#else /* __linux__ */
   2925    /* Do nothing.  Live migration to a remote host with cache.direct=off is
   2926     * unsupported on other host operating systems.  Cache consistency issues
   2927     * may occur but no error is reported here, partly because that's the
   2928     * historical behavior and partly because it's hard to differentiate valid
   2929     * configurations that should not cause errors.
   2930     */
   2931#endif /* !__linux__ */
   2932}
   2933
   2934static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
   2935{
   2936    if (ret) {
   2937        s->stats.discard_nb_failed++;
   2938    } else {
   2939        s->stats.discard_nb_ok++;
   2940        s->stats.discard_bytes_ok += nbytes;
   2941    }
   2942}
   2943
   2944static coroutine_fn int
   2945raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
   2946                bool blkdev)
   2947{
   2948    BDRVRawState *s = bs->opaque;
   2949    RawPosixAIOData acb;
   2950    int ret;
   2951
   2952    acb = (RawPosixAIOData) {
   2953        .bs             = bs,
   2954        .aio_fildes     = s->fd,
   2955        .aio_type       = QEMU_AIO_DISCARD,
   2956        .aio_offset     = offset,
   2957        .aio_nbytes     = bytes,
   2958    };
   2959
   2960    if (blkdev) {
   2961        acb.aio_type |= QEMU_AIO_BLKDEV;
   2962    }
   2963
   2964    ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
   2965    raw_account_discard(s, bytes, ret);
   2966    return ret;
   2967}
   2968
   2969static coroutine_fn int
   2970raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
   2971{
   2972    return raw_do_pdiscard(bs, offset, bytes, false);
   2973}
   2974
   2975static int coroutine_fn
   2976raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
   2977                     BdrvRequestFlags flags, bool blkdev)
   2978{
   2979    BDRVRawState *s = bs->opaque;
   2980    RawPosixAIOData acb;
   2981    ThreadPoolFunc *handler;
   2982
   2983#ifdef CONFIG_FALLOCATE
   2984    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
   2985        BdrvTrackedRequest *req;
   2986
   2987        /*
   2988         * This is a workaround for a bug in the Linux XFS driver,
   2989         * where writes submitted through the AIO interface will be
   2990         * discarded if they happen beyond a concurrently running
   2991         * fallocate() that increases the file length (i.e., both the
   2992         * write and the fallocate() happen beyond the EOF).
   2993         *
   2994         * To work around it, we extend the tracked request for this
   2995         * zero write until INT64_MAX (effectively infinity), and mark
   2996         * it as serializing.
   2997         *
   2998         * We have to enable this workaround for all filesystems and
   2999         * AIO modes (not just XFS with aio=native), because for
   3000         * remote filesystems we do not know the host configuration.
   3001         */
   3002
   3003        req = bdrv_co_get_self_request(bs);
   3004        assert(req);
   3005        assert(req->type == BDRV_TRACKED_WRITE);
   3006        assert(req->offset <= offset);
   3007        assert(req->offset + req->bytes >= offset + bytes);
   3008
   3009        req->bytes = BDRV_MAX_LENGTH - req->offset;
   3010
   3011        bdrv_check_request(req->offset, req->bytes, &error_abort);
   3012
   3013        bdrv_make_request_serialising(req, bs->bl.request_alignment);
   3014    }
   3015#endif
   3016
   3017    acb = (RawPosixAIOData) {
   3018        .bs             = bs,
   3019        .aio_fildes     = s->fd,
   3020        .aio_type       = QEMU_AIO_WRITE_ZEROES,
   3021        .aio_offset     = offset,
   3022        .aio_nbytes     = bytes,
   3023    };
   3024
   3025    if (blkdev) {
   3026        acb.aio_type |= QEMU_AIO_BLKDEV;
   3027    }
   3028    if (flags & BDRV_REQ_NO_FALLBACK) {
   3029        acb.aio_type |= QEMU_AIO_NO_FALLBACK;
   3030    }
   3031
   3032    if (flags & BDRV_REQ_MAY_UNMAP) {
   3033        acb.aio_type |= QEMU_AIO_DISCARD;
   3034        handler = handle_aiocb_write_zeroes_unmap;
   3035    } else {
   3036        handler = handle_aiocb_write_zeroes;
   3037    }
   3038
   3039    return raw_thread_pool_submit(bs, handler, &acb);
   3040}
   3041
   3042static int coroutine_fn raw_co_pwrite_zeroes(
   3043    BlockDriverState *bs, int64_t offset,
   3044    int64_t bytes, BdrvRequestFlags flags)
   3045{
   3046    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
   3047}
   3048
   3049static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
   3050{
   3051    return 0;
   3052}
   3053
   3054static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
   3055{
   3056    BDRVRawState *s = bs->opaque;
   3057    return (BlockStatsSpecificFile) {
   3058        .discard_nb_ok = s->stats.discard_nb_ok,
   3059        .discard_nb_failed = s->stats.discard_nb_failed,
   3060        .discard_bytes_ok = s->stats.discard_bytes_ok,
   3061    };
   3062}
   3063
   3064static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
   3065{
   3066    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
   3067
   3068    stats->driver = BLOCKDEV_DRIVER_FILE;
   3069    stats->u.file = get_blockstats_specific_file(bs);
   3070
   3071    return stats;
   3072}
   3073
   3074#if defined(HAVE_HOST_BLOCK_DEVICE)
   3075static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
   3076{
   3077    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
   3078
   3079    stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
   3080    stats->u.host_device = get_blockstats_specific_file(bs);
   3081
   3082    return stats;
   3083}
   3084#endif /* HAVE_HOST_BLOCK_DEVICE */
   3085
/*
 * Creation options for raw image files.  Referenced from bdrv_file via
 * its .create_opts member.
 */
static QemuOptsList raw_create_opts = {
    .name = "raw-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_NOCOW,
            .type = QEMU_OPT_BOOL,
            .help = "Turn off copy-on-write (valid only on btrfs)"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            /* "falloc" is only advertised when CONFIG_POSIX_FALLOCATE is set */
            .help = "Preallocation mode (allowed values: off"
#ifdef CONFIG_POSIX_FALLOCATE
                    ", falloc"
#endif
                    ", full)"
        },
        {
            .name = BLOCK_OPT_EXTENT_SIZE_HINT,
            .type = QEMU_OPT_SIZE,
            .help = "Extent size hint for the image file, 0 to disable"
        },
        { /* end of list */ }
    }
};
   3117
   3118static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
   3119                          Error **errp)
   3120{
   3121    BDRVRawState *s = bs->opaque;
   3122    int input_flags = s->reopen_state ? s->reopen_state->flags : bs->open_flags;
   3123    int open_flags;
   3124    int ret;
   3125
   3126    /* We may need a new fd if auto-read-only switches the mode */
   3127    ret = raw_reconfigure_getfd(bs, input_flags, &open_flags, perm,
   3128                                false, errp);
   3129    if (ret < 0) {
   3130        return ret;
   3131    } else if (ret != s->fd) {
   3132        Error *local_err = NULL;
   3133
   3134        /*
   3135         * Fail already check_perm() if we can't get a working O_DIRECT
   3136         * alignment with the new fd.
   3137         */
   3138        raw_probe_alignment(bs, ret, &local_err);
   3139        if (local_err) {
   3140            error_propagate(errp, local_err);
   3141            return -EINVAL;
   3142        }
   3143
   3144        s->perm_change_fd = ret;
   3145        s->perm_change_flags = open_flags;
   3146    }
   3147
   3148    /* Prepare permissions on old fd to avoid conflicts between old and new,
   3149     * but keep everything locked that new will need. */
   3150    ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
   3151    if (ret < 0) {
   3152        goto fail;
   3153    }
   3154
   3155    /* Copy locks to the new fd */
   3156    if (s->perm_change_fd && s->use_lock) {
   3157        ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
   3158                                   false, errp);
   3159        if (ret < 0) {
   3160            raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
   3161            goto fail;
   3162        }
   3163    }
   3164    return 0;
   3165
   3166fail:
   3167    if (s->perm_change_fd) {
   3168        qemu_close(s->perm_change_fd);
   3169    }
   3170    s->perm_change_fd = 0;
   3171    return ret;
   3172}
   3173
   3174static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
   3175{
   3176    BDRVRawState *s = bs->opaque;
   3177
   3178    /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
   3179     * called after .bdrv_reopen_commit) */
   3180    if (s->perm_change_fd && s->fd != s->perm_change_fd) {
   3181        qemu_close(s->fd);
   3182        s->fd = s->perm_change_fd;
   3183        s->open_flags = s->perm_change_flags;
   3184    }
   3185    s->perm_change_fd = 0;
   3186
   3187    raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
   3188    s->perm = perm;
   3189    s->shared_perm = shared;
   3190}
   3191
   3192static void raw_abort_perm_update(BlockDriverState *bs)
   3193{
   3194    BDRVRawState *s = bs->opaque;
   3195
   3196    /* For reopen, .bdrv_reopen_abort is called afterwards and will close
   3197     * the file descriptor. */
   3198    if (s->perm_change_fd) {
   3199        qemu_close(s->perm_change_fd);
   3200    }
   3201    s->perm_change_fd = 0;
   3202
   3203    raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
   3204}
   3205
   3206static int coroutine_fn raw_co_copy_range_from(
   3207        BlockDriverState *bs, BdrvChild *src, int64_t src_offset,
   3208        BdrvChild *dst, int64_t dst_offset, int64_t bytes,
   3209        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
   3210{
   3211    return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
   3212                                 read_flags, write_flags);
   3213}
   3214
   3215static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
   3216                                             BdrvChild *src,
   3217                                             int64_t src_offset,
   3218                                             BdrvChild *dst,
   3219                                             int64_t dst_offset,
   3220                                             int64_t bytes,
   3221                                             BdrvRequestFlags read_flags,
   3222                                             BdrvRequestFlags write_flags)
   3223{
   3224    RawPosixAIOData acb;
   3225    BDRVRawState *s = bs->opaque;
   3226    BDRVRawState *src_s;
   3227
   3228    assert(dst->bs == bs);
   3229    if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
   3230        return -ENOTSUP;
   3231    }
   3232
   3233    src_s = src->bs->opaque;
   3234    if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
   3235        return -EIO;
   3236    }
   3237
   3238    acb = (RawPosixAIOData) {
   3239        .bs             = bs,
   3240        .aio_type       = QEMU_AIO_COPY_RANGE,
   3241        .aio_fildes     = src_s->fd,
   3242        .aio_offset     = src_offset,
   3243        .aio_nbytes     = bytes,
   3244        .copy_range     = {
   3245            .aio_fd2        = s->fd,
   3246            .aio_offset2    = dst_offset,
   3247        },
   3248    };
   3249
   3250    return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb);
   3251}
   3252
/*
 * Driver table for the "file" protocol: wires the raw_* implementations
 * in this file to the generic BlockDriver callbacks.
 */
BlockDriver bdrv_file = {
    .format_name = "file",
    .protocol_name = "file",
    .instance_size = sizeof(BDRVRawState),
    .bdrv_needs_filename = true,
    .bdrv_probe = NULL, /* no probe for protocols */
    .bdrv_parse_filename = raw_parse_filename,
    .bdrv_file_open = raw_open,
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit = raw_reopen_commit,
    .bdrv_reopen_abort = raw_reopen_abort,
    .bdrv_close = raw_close,
    .bdrv_co_create = raw_co_create,
    .bdrv_co_create_opts = raw_co_create_opts,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_block_status = raw_co_block_status,
    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
    .bdrv_co_delete_file = raw_co_delete_file,

    /* Data path: read, write, flush, discard, copy offload */
    .bdrv_co_preadv         = raw_co_preadv,
    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
    .bdrv_co_pdiscard       = raw_co_pdiscard,
    .bdrv_co_copy_range_from = raw_co_copy_range_from,
    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,

    /* Size, info, statistics, permissions and option tables */
    .bdrv_co_truncate = raw_co_truncate,
    .bdrv_getlength = raw_getlength,
    .bdrv_get_info = raw_get_info,
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,
    .bdrv_get_specific_stats = raw_get_specific_stats,
    .bdrv_check_perm = raw_check_perm,
    .bdrv_set_perm   = raw_set_perm,
    .bdrv_abort_perm_update = raw_abort_perm_update,
    .create_opts = &raw_create_opts,
    .mutable_opts = mutable_opts,
};
   3296
   3297/***********************************************/
   3298/* host device */
   3299
   3300#if defined(HAVE_HOST_BLOCK_DEVICE)
   3301
   3302#if defined(__APPLE__) && defined(__MACH__)
   3303static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
   3304                                CFIndex maxPathSize, int flags);
   3305static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
   3306{
   3307    kern_return_t kernResult = KERN_FAILURE;
   3308    mach_port_t     masterPort;
   3309    CFMutableDictionaryRef  classesToMatch;
   3310    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
   3311    char *mediaType = NULL;
   3312
   3313    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
   3314    if ( KERN_SUCCESS != kernResult ) {
   3315        printf( "IOMasterPort returned %d\n", kernResult );
   3316    }
   3317
   3318    int index;
   3319    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
   3320        classesToMatch = IOServiceMatching(matching_array[index]);
   3321        if (classesToMatch == NULL) {
   3322            error_report("IOServiceMatching returned NULL for %s",
   3323                         matching_array[index]);
   3324            continue;
   3325        }
   3326        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
   3327                             kCFBooleanTrue);
   3328        kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
   3329                                                  mediaIterator);
   3330        if (kernResult != KERN_SUCCESS) {
   3331            error_report("Note: IOServiceGetMatchingServices returned %d",
   3332                         kernResult);
   3333            continue;
   3334        }
   3335
   3336        /* If a match was found, leave the loop */
   3337        if (*mediaIterator != 0) {
   3338            trace_file_FindEjectableOpticalMedia(matching_array[index]);
   3339            mediaType = g_strdup(matching_array[index]);
   3340            break;
   3341        }
   3342    }
   3343    return mediaType;
   3344}
   3345
   3346kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
   3347                         CFIndex maxPathSize, int flags)
   3348{
   3349    io_object_t     nextMedia;
   3350    kern_return_t   kernResult = KERN_FAILURE;
   3351    *bsdPath = '\0';
   3352    nextMedia = IOIteratorNext( mediaIterator );
   3353    if ( nextMedia )
   3354    {
   3355        CFTypeRef   bsdPathAsCFString;
   3356    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
   3357        if ( bsdPathAsCFString ) {
   3358            size_t devPathLength;
   3359            strcpy( bsdPath, _PATH_DEV );
   3360            if (flags & BDRV_O_NOCACHE) {
   3361                strcat(bsdPath, "r");
   3362            }
   3363            devPathLength = strlen( bsdPath );
   3364            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
   3365                kernResult = KERN_SUCCESS;
   3366            }
   3367            CFRelease( bsdPathAsCFString );
   3368        }
   3369        IOObjectRelease( nextMedia );
   3370    }
   3371
   3372    return kernResult;
   3373}
   3374
   3375/* Sets up a real cdrom for use in QEMU */
   3376static bool setup_cdrom(char *bsd_path, Error **errp)
   3377{
   3378    int index, num_of_test_partitions = 2, fd;
   3379    char test_partition[MAXPATHLEN];
   3380    bool partition_found = false;
   3381
   3382    /* look for a working partition */
   3383    for (index = 0; index < num_of_test_partitions; index++) {
   3384        snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
   3385                 index);
   3386        fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL);
   3387        if (fd >= 0) {
   3388            partition_found = true;
   3389            qemu_close(fd);
   3390            break;
   3391        }
   3392    }
   3393
   3394    /* if a working partition on the device was not found */
   3395    if (partition_found == false) {
   3396        error_setg(errp, "Failed to find a working partition on disc");
   3397    } else {
   3398        trace_file_setup_cdrom(test_partition);
   3399        pstrcpy(bsd_path, MAXPATHLEN, test_partition);
   3400    }
   3401    return partition_found;
   3402}
   3403
   3404/* Prints directions on mounting and unmounting a device */
   3405static void print_unmounting_directions(const char *file_name)
   3406{
   3407    error_report("If device %s is mounted on the desktop, unmount"
   3408                 " it first before using it in QEMU", file_name);
   3409    error_report("Command to unmount device: diskutil unmountDisk %s",
   3410                 file_name);
   3411    error_report("Command to mount device: diskutil mountDisk %s", file_name);
   3412}
   3413
   3414#endif /* defined(__APPLE__) && defined(__MACH__) */
   3415
   3416static int hdev_probe_device(const char *filename)
   3417{
   3418    struct stat st;
   3419
   3420    /* allow a dedicated CD-ROM driver to match with a higher priority */
   3421    if (strstart(filename, "/dev/cdrom", NULL))
   3422        return 50;
   3423
   3424    if (stat(filename, &st) >= 0 &&
   3425            (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
   3426        return 100;
   3427    }
   3428
   3429    return 0;
   3430}
   3431
   3432static void hdev_parse_filename(const char *filename, QDict *options,
   3433                                Error **errp)
   3434{
   3435    bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
   3436}
   3437
   3438static bool hdev_is_sg(BlockDriverState *bs)
   3439{
   3440
   3441#if defined(__linux__)
   3442
   3443    BDRVRawState *s = bs->opaque;
   3444    struct stat st;
   3445    struct sg_scsi_id scsiid;
   3446    int sg_version;
   3447    int ret;
   3448
   3449    if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
   3450        return false;
   3451    }
   3452
   3453    ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
   3454    if (ret < 0) {
   3455        return false;
   3456    }
   3457
   3458    ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
   3459    if (ret >= 0) {
   3460        trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
   3461        return true;
   3462    }
   3463
   3464#endif
   3465
   3466    return false;
   3467}
   3468
/*
 * Open callback of the host_device driver.
 *
 * @bs:      block driver state; bs->opaque is the BDRVRawState to fill in
 * @options: open options; on macOS the "filename" entry may be rewritten
 * @flags:   open flags, forwarded to raw_open_common()
 * @errp:    receives the error description on failure
 *
 * On macOS the special name "/dev/cdrom" is first translated into the BSD
 * path of the optical drive.  The device is then opened through
 * raw_open_common() and bs->sg is set from hdev_is_sg().
 *
 * Returns the result of raw_open_common(), or -ENOENT when the macOS
 * optical drive lookup fails.
 */
static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    BDRVRawState *s = bs->opaque;
    int ret;

#if defined(__APPLE__) && defined(__MACH__)
    /*
     * Caution: while qdict_get_str() is fine, getting non-string types
     * would require more care.  When @options come from -blockdev or
     * blockdev_add, its members are typed according to the QAPI
     * schema, but when they come from -drive, they're all QString.
     */
    const char *filename = qdict_get_str(options, "filename");
    /* Stays empty unless the optical drive lookup below fills it in */
    char bsd_path[MAXPATHLEN] = "";
    bool error_occurred = false;

    /* If using a real cdrom */
    if (strcmp(filename, "/dev/cdrom") == 0) {
        char *mediaType = NULL;
        kern_return_t ret_val;
        io_iterator_t mediaIterator = 0;

        mediaType = FindEjectableOpticalMedia(&mediaIterator);
        if (mediaType == NULL) {
            error_setg(errp, "Please make sure your CD/DVD is in the optical"
                       " drive");
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
        if (ret_val != KERN_SUCCESS) {
            error_setg(errp, "Could not get BSD path for optical drive");
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        /* If a real optical drive was not found */
        if (bsd_path[0] == '\0') {
            error_setg(errp, "Failed to obtain bsd path for optical drive");
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        /*
         * If using a cdrom disc and finding a partition on the disc failed.
         * Only the first 9 characters of the media type are compared.
         */
        if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
            setup_cdrom(bsd_path, errp) == false) {
            print_unmounting_directions(bsd_path);
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        /* Open the resolved BSD path instead of "/dev/cdrom" */
        qdict_put_str(options, "filename", bsd_path);

        /*
         * Shared cleanup: reached by falling through on success as well as
         * by the gotos above; error_occurred tells the two cases apart.
         */
hdev_open_Mac_error:
        g_free(mediaType);
        if (mediaIterator) {
            IOObjectRelease(mediaIterator);
        }
        if (error_occurred) {
            return -ENOENT;
        }
    }
#endif /* defined(__APPLE__) && defined(__MACH__) */

    s->type = FTYPE_FILE;

    ret = raw_open_common(bs, options, flags, 0, true, errp);
    if (ret < 0) {
#if defined(__APPLE__) && defined(__MACH__)
        /* Prefer the resolved optical drive path when one was found */
        if (*bsd_path) {
            filename = bsd_path;
        }
        /* if a physical device experienced an error while being opened */
        if (strncmp(filename, "/dev/", 5) == 0) {
            print_unmounting_directions(filename);
        }
#endif /* defined(__APPLE__) && defined(__MACH__) */
        return ret;
    }

    /* Since this does ioctl the device must be already opened */
    bs->sg = hdev_is_sg(bs);

    return ret;
}
   3556
   3557#if defined(__linux__)
   3558static int coroutine_fn
   3559hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
   3560{
   3561    BDRVRawState *s = bs->opaque;
   3562    RawPosixAIOData acb;
   3563    int ret;
   3564
   3565    ret = fd_open(bs);
   3566    if (ret < 0) {
   3567        return ret;
   3568    }
   3569
   3570    if (req == SG_IO && s->pr_mgr) {
   3571        struct sg_io_hdr *io_hdr = buf;
   3572        if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
   3573            io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
   3574            return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
   3575                                      s->fd, io_hdr);
   3576        }
   3577    }
   3578
   3579    acb = (RawPosixAIOData) {
   3580        .bs         = bs,
   3581        .aio_type   = QEMU_AIO_IOCTL,
   3582        .aio_fildes = s->fd,
   3583        .aio_offset = 0,
   3584        .ioctl      = {
   3585            .buf        = buf,
   3586            .cmd        = req,
   3587        },
   3588    };
   3589
   3590    return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb);
   3591}
   3592#endif /* linux */
   3593
   3594static coroutine_fn int
   3595hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
   3596{
   3597    BDRVRawState *s = bs->opaque;
   3598    int ret;
   3599
   3600    ret = fd_open(bs);
   3601    if (ret < 0) {
   3602        raw_account_discard(s, bytes, ret);
   3603        return ret;
   3604    }
   3605    return raw_do_pdiscard(bs, offset, bytes, true);
   3606}
   3607
   3608static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
   3609    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
   3610{
   3611    int rc;
   3612
   3613    rc = fd_open(bs);
   3614    if (rc < 0) {
   3615        return rc;
   3616    }
   3617
   3618    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
   3619}
   3620
/*
 * Driver table for the "host_device" protocol.
 *
 * Opening, discard, write-zeroes, statistics and the probe callbacks use
 * the hdev_* functions; the remaining callbacks are the raw_* ones.
 */
static BlockDriver bdrv_host_device = {
    .format_name        = "host_device",
    .protocol_name        = "host_device",
    .instance_size      = sizeof(BDRVRawState),
    .bdrv_needs_filename = true,
    .bdrv_probe_device  = hdev_probe_device,
    .bdrv_parse_filename = hdev_parse_filename,
    .bdrv_file_open     = hdev_open,
    .bdrv_close         = raw_close,
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
    .create_opts         = &bdrv_create_opts_simple,
    .mutable_opts        = mutable_opts,
    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,

    /* data path */
    .bdrv_co_preadv         = raw_co_preadv,
    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
    .bdrv_co_pdiscard       = hdev_co_pdiscard,
    .bdrv_co_copy_range_from = raw_co_copy_range_from,
    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,

    /* size, info, permissions and probing */
    .bdrv_co_truncate       = raw_co_truncate,
    .bdrv_getlength	= raw_getlength,
    .bdrv_get_info = raw_get_info,
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,
    .bdrv_get_specific_stats = hdev_get_specific_stats,
    .bdrv_check_perm = raw_check_perm,
    .bdrv_set_perm   = raw_set_perm,
    .bdrv_abort_perm_update = raw_abort_perm_update,
    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
    .bdrv_probe_geometry = hdev_probe_geometry,

    /* generic scsi device; hdev_co_ioctl is only built on Linux */
#ifdef __linux__
    .bdrv_co_ioctl          = hdev_co_ioctl,
#endif
};
   3667
   3668#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
/*
 * Filename parser of the host_cdrom driver: delegates to
 * bdrv_parse_filename_strip_prefix() with the "host_cdrom:" prefix.
 * @errp is not used here.
 */
static void cdrom_parse_filename(const char *filename, QDict *options,
                                 Error **errp)
{
    bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
}
   3674#endif
   3675
   3676#ifdef __linux__
/*
 * Open callback of the Linux host_cdrom driver.
 *
 * Marks the state as FTYPE_CD and returns the result of
 * raw_open_common(), which is asked to add O_NONBLOCK to the open flags.
 */
static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
{
    BDRVRawState *s = bs->opaque;

    s->type = FTYPE_CD;

    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
    return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
}
   3687
   3688static int cdrom_probe_device(const char *filename)
   3689{
   3690    int fd, ret;
   3691    int prio = 0;
   3692    struct stat st;
   3693
   3694    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL);
   3695    if (fd < 0) {
   3696        goto out;
   3697    }
   3698    ret = fstat(fd, &st);
   3699    if (ret == -1 || !S_ISBLK(st.st_mode)) {
   3700        goto outc;
   3701    }
   3702
   3703    /* Attempt to detect via a CDROM specific ioctl */
   3704    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
   3705    if (ret >= 0)
   3706        prio = 100;
   3707
   3708outc:
   3709    qemu_close(fd);
   3710out:
   3711    return prio;
   3712}
   3713
   3714static bool cdrom_is_inserted(BlockDriverState *bs)
   3715{
   3716    BDRVRawState *s = bs->opaque;
   3717    int ret;
   3718
   3719    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
   3720    return ret == CDS_DISC_OK;
   3721}
   3722
   3723static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
   3724{
   3725    BDRVRawState *s = bs->opaque;
   3726
   3727    if (eject_flag) {
   3728        if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
   3729            perror("CDROMEJECT");
   3730    } else {
   3731        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
   3732            perror("CDROMEJECT");
   3733    }
   3734}
   3735
/*
 * Lock or unlock the drive door by passing @locked to the CDROM_LOCKDOOR
 * ioctl.  A failure is deliberately ignored: the error branch is empty.
 */
static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
{
    BDRVRawState *s = bs->opaque;

    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
        /*
         * Note: an error can happen if the distribution automatically
         * mounts the CD-ROM
         */
        /* perror("CDROM_LOCKDOOR"); */
    }
}
   3748
/*
 * Driver table for the "host_cdrom" protocol on Linux.
 *
 * Probing, opening and the removable-media callbacks use the cdrom_*
 * functions; the remaining callbacks are the raw_* ones, and ioctls go
 * through hdev_co_ioctl.
 */
static BlockDriver bdrv_host_cdrom = {
    .format_name        = "host_cdrom",
    .protocol_name      = "host_cdrom",
    .instance_size      = sizeof(BDRVRawState),
    .bdrv_needs_filename = true,
    .bdrv_probe_device	= cdrom_probe_device,
    .bdrv_parse_filename = cdrom_parse_filename,
    .bdrv_file_open     = cdrom_open,
    .bdrv_close         = raw_close,
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
    .create_opts         = &bdrv_create_opts_simple,
    .mutable_opts        = mutable_opts,
    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,

    /* data path */
    .bdrv_co_preadv         = raw_co_preadv,
    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,

    /* size queries */
    .bdrv_co_truncate    = raw_co_truncate,
    .bdrv_getlength      = raw_getlength,
    .has_variable_length = true,
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,

    /* removable device support */
    .bdrv_is_inserted   = cdrom_is_inserted,
    .bdrv_eject         = cdrom_eject,
    .bdrv_lock_medium   = cdrom_lock_medium,

    /* generic scsi device */
    .bdrv_co_ioctl      = hdev_co_ioctl,
};
   3788#endif /* __linux__ */
   3789
   3790#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
   3791static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
   3792                      Error **errp)
   3793{
   3794    BDRVRawState *s = bs->opaque;
   3795    int ret;
   3796
   3797    s->type = FTYPE_CD;
   3798
   3799    ret = raw_open_common(bs, options, flags, 0, true, errp);
   3800    if (ret) {
   3801        return ret;
   3802    }
   3803
   3804    /* make sure the door isn't locked at this time */
   3805    ioctl(s->fd, CDIOCALLOW);
   3806    return 0;
   3807}
   3808
   3809static int cdrom_probe_device(const char *filename)
   3810{
   3811    if (strstart(filename, "/dev/cd", NULL) ||
   3812            strstart(filename, "/dev/acd", NULL))
   3813        return 100;
   3814    return 0;
   3815}
   3816
   3817static int cdrom_reopen(BlockDriverState *bs)
   3818{
   3819    BDRVRawState *s = bs->opaque;
   3820    int fd;
   3821
   3822    /*
   3823     * Force reread of possibly changed/newly loaded disc,
   3824     * FreeBSD seems to not notice sometimes...
   3825     */
   3826    if (s->fd >= 0)
   3827        qemu_close(s->fd);
   3828    fd = qemu_open(bs->filename, s->open_flags, NULL);
   3829    if (fd < 0) {
   3830        s->fd = -1;
   3831        return -EIO;
   3832    }
   3833    s->fd = fd;
   3834
   3835    /* make sure the door isn't locked at this time */
   3836    ioctl(s->fd, CDIOCALLOW);
   3837    return 0;
   3838}
   3839
/*
 * Report a medium as inserted when raw_getlength() returns a positive
 * length for @bs.
 */
static bool cdrom_is_inserted(BlockDriverState *bs)
{
    return raw_getlength(bs) > 0;
}
   3844
   3845static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
   3846{
   3847    BDRVRawState *s = bs->opaque;
   3848
   3849    if (s->fd < 0)
   3850        return;
   3851
   3852    (void) ioctl(s->fd, CDIOCALLOW);
   3853
   3854    if (eject_flag) {
   3855        if (ioctl(s->fd, CDIOCEJECT) < 0)
   3856            perror("CDIOCEJECT");
   3857    } else {
   3858        if (ioctl(s->fd, CDIOCCLOSE) < 0)
   3859            perror("CDIOCCLOSE");
   3860    }
   3861
   3862    cdrom_reopen(bs);
   3863}
   3864
/*
 * Lock or unlock the drive door on FreeBSD: CDIOCPREVENT when @locked is
 * true, CDIOCALLOW otherwise.  Does nothing when no descriptor is open,
 * and a failing ioctl is deliberately ignored.
 */
static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
{
    BDRVRawState *s = bs->opaque;

    if (s->fd < 0)
        return;
    if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
        /*
         * Note: an error can happen if the distribution automatically
         * mounts the CD-ROM
         */
        /* perror("CDROM_LOCKDOOR"); */
    }
}
   3879
/*
 * Driver table for the "host_cdrom" protocol on FreeBSD.
 *
 * Probing, opening and the removable-media callbacks use the cdrom_*
 * functions; the remaining callbacks are the raw_* ones.  This table
 * sets neither .bdrv_co_ioctl nor .bdrv_co_invalidate_cache.
 */
static BlockDriver bdrv_host_cdrom = {
    .format_name        = "host_cdrom",
    .protocol_name      = "host_cdrom",
    .instance_size      = sizeof(BDRVRawState),
    .bdrv_needs_filename = true,
    .bdrv_probe_device	= cdrom_probe_device,
    .bdrv_parse_filename = cdrom_parse_filename,
    .bdrv_file_open     = cdrom_open,
    .bdrv_close         = raw_close,
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
    .create_opts         = &bdrv_create_opts_simple,
    .mutable_opts       = mutable_opts,

    /* data path */
    .bdrv_co_preadv         = raw_co_preadv,
    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,

    /* size queries */
    .bdrv_co_truncate    = raw_co_truncate,
    .bdrv_getlength      = raw_getlength,
    .has_variable_length = true,
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,

    /* removable device support */
    .bdrv_is_inserted   = cdrom_is_inserted,
    .bdrv_eject         = cdrom_eject,
    .bdrv_lock_medium   = cdrom_lock_medium,
};
   3915#endif /* __FreeBSD__ */
   3916
   3917#endif /* HAVE_HOST_BLOCK_DEVICE */
   3918
   3919static void bdrv_file_init(void)
   3920{
   3921    /*
   3922     * Register all the drivers.  Note that order is important, the driver
   3923     * registered last will get probed first.
   3924     */
   3925    bdrv_register(&bdrv_file);
   3926#if defined(HAVE_HOST_BLOCK_DEVICE)
   3927    bdrv_register(&bdrv_host_device);
   3928#ifdef __linux__
   3929    bdrv_register(&bdrv_host_cdrom);
   3930#endif
   3931#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
   3932    bdrv_register(&bdrv_host_cdrom);
   3933#endif
   3934#endif /* HAVE_HOST_BLOCK_DEVICE */
   3935}
   3936
   3937block_init(bdrv_file_init);