cachepc-qemu

Fork of AMDESE/qemu with changes for the cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

server.c (87572B)


      1/*
      2 *  Copyright (C) 2016-2020 Red Hat, Inc.
      3 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
      4 *
      5 *  Network Block Device Server Side
      6 *
      7 *  This program is free software; you can redistribute it and/or modify
      8 *  it under the terms of the GNU General Public License as published by
      9 *  the Free Software Foundation; under version 2 of the License.
     10 *
     11 *  This program is distributed in the hope that it will be useful,
     12 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 *  GNU General Public License for more details.
     15 *
     16 *  You should have received a copy of the GNU General Public License
     17 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
     18 */
     19
     20#include "qemu/osdep.h"
     21
     22#include "block/export.h"
     23#include "qapi/error.h"
     24#include "qemu/queue.h"
     25#include "trace.h"
     26#include "nbd-internal.h"
     27#include "qemu/units.h"
     28
     29#define NBD_META_ID_BASE_ALLOCATION 0
     30#define NBD_META_ID_ALLOCATION_DEPTH 1
     31/* Dirty bitmaps use 'NBD_META_ID_DIRTY_BITMAP + i', so keep this id last. */
     32#define NBD_META_ID_DIRTY_BITMAP 2
     33
     34/*
     35 * NBD_MAX_BLOCK_STATUS_EXTENTS: 1 MiB of extents data. An empirical
     36 * constant. If an increase is needed, note that the NBD protocol
      37 * recommends no larger than 32 MiB, so that the client won't consider
     38 * the reply as a denial of service attack.
     39 */
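        /* Each transmitted block-status extent descriptor is 8 bytes on the
         * wire (32-bit extent length + 32-bit status flags), so 1 MiB of
         * extent data allows roughly 128k extents per reply. */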
     40#define NBD_MAX_BLOCK_STATUS_EXTENTS (1 * MiB / 8)
     41
     42static int system_errno_to_nbd_errno(int err)
     43{
     44    switch (err) {
     45    case 0:
     46        return NBD_SUCCESS;
     47    case EPERM:
     48    case EROFS:
     49        return NBD_EPERM;
     50    case EIO:
     51        return NBD_EIO;
     52    case ENOMEM:
     53        return NBD_ENOMEM;
     54#ifdef EDQUOT
     55    case EDQUOT:
     56#endif
     57    case EFBIG:
     58    case ENOSPC:
     59        return NBD_ENOSPC;
     60    case EOVERFLOW:
     61        return NBD_EOVERFLOW;
     62    case ENOTSUP:
     63#if ENOTSUP != EOPNOTSUPP
     64    case EOPNOTSUPP:
     65#endif
     66        return NBD_ENOTSUP;
     67    case ESHUTDOWN:
     68        return NBD_ESHUTDOWN;
     69    case EINVAL:
     70    default:
     71        return NBD_EINVAL;
     72    }
     73}
     74
     75/* Definitions for opaque data types */
     76
     77typedef struct NBDRequestData NBDRequestData;
     78
     79struct NBDRequestData {
     80    QSIMPLEQ_ENTRY(NBDRequestData) entry;
     81    NBDClient *client;
     82    uint8_t *data;
     83    bool complete;
     84};
     85
     86struct NBDExport {
     87    BlockExport common;
     88
     89    char *name;
     90    char *description;
     91    uint64_t size;
     92    uint16_t nbdflags;
     93    QTAILQ_HEAD(, NBDClient) clients;
     94    QTAILQ_ENTRY(NBDExport) next;
     95
     96    BlockBackend *eject_notifier_blk;
     97    Notifier eject_notifier;
     98
     99    bool allocation_depth;
    100    BdrvDirtyBitmap **export_bitmaps;
    101    size_t nr_export_bitmaps;
    102};
    103
    104static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
    105
    106/* NBDExportMetaContexts represents a list of contexts to be exported,
    107 * as selected by NBD_OPT_SET_META_CONTEXT. Also used for
    108 * NBD_OPT_LIST_META_CONTEXT. */
    109typedef struct NBDExportMetaContexts {
    110    NBDExport *exp;
    111    size_t count; /* number of negotiated contexts */
    112    bool base_allocation; /* export base:allocation context (block status) */
    113    bool allocation_depth; /* export qemu:allocation-depth */
    114    bool *bitmaps; /*
    115                    * export qemu:dirty-bitmap:<export bitmap name>,
    116                    * sized by exp->nr_export_bitmaps
    117                    */
    118} NBDExportMetaContexts;
    119
    120struct NBDClient {
    121    int refcount;
    122    void (*close_fn)(NBDClient *client, bool negotiated);
    123
    124    NBDExport *exp;
    125    QCryptoTLSCreds *tlscreds;
    126    char *tlsauthz;
    127    QIOChannelSocket *sioc; /* The underlying data channel */
    128    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
    129
    130    Coroutine *recv_coroutine;
    131
    132    CoMutex send_lock;
    133    Coroutine *send_coroutine;
    134
    135    bool read_yielding;
    136    bool quiescing;
    137
    138    QTAILQ_ENTRY(NBDClient) next;
    139    int nb_requests;
    140    bool closing;
    141
    142    uint32_t check_align; /* If non-zero, check for aligned client requests */
    143
    144    bool structured_reply;
    145    NBDExportMetaContexts export_meta;
    146
    147    uint32_t opt; /* Current option being negotiated */
    148    uint32_t optlen; /* remaining length of data in ioc for the option being
    149                        negotiated now */
    150};
    151
    152static void nbd_client_receive_next_request(NBDClient *client);
    153
    154/* Basic flow for negotiation
    155
    156   Server         Client
    157   Negotiate
    158
    159   or
    160
    161   Server         Client
    162   Negotiate #1
    163                  Option
    164   Negotiate #2
    165
    166   ----
    167
    168   followed by
    169
    170   Server         Client
    171                  Request
    172   Response
    173                  Request
    174   Response
    175                  ...
    176   ...
    177                  Request (type == 2)
    178
    179*/
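        /* In the transmission-phase diagram above, "Request (type == 2)" is
         * NBD_CMD_DISC, the client's orderly disconnect request, which ends
         * the session without a response. */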
    180
    181static inline void set_be_option_rep(NBDOptionReply *rep, uint32_t option,
    182                                     uint32_t type, uint32_t length)
    183{
    184    stq_be_p(&rep->magic, NBD_REP_MAGIC);
    185    stl_be_p(&rep->option, option);
    186    stl_be_p(&rep->type, type);
    187    stl_be_p(&rep->length, length);
    188}
    189
    190/* Send a reply header, including length, but no payload.
    191 * Return -errno on error, 0 on success. */
    192static int nbd_negotiate_send_rep_len(NBDClient *client, uint32_t type,
    193                                      uint32_t len, Error **errp)
    194{
    195    NBDOptionReply rep;
    196
    197    trace_nbd_negotiate_send_rep_len(client->opt, nbd_opt_lookup(client->opt),
    198                                     type, nbd_rep_lookup(type), len);
    199
    200    assert(len < NBD_MAX_BUFFER_SIZE);
    201
    202    set_be_option_rep(&rep, client->opt, type, len);
    203    return nbd_write(client->ioc, &rep, sizeof(rep), errp);
    204}
    205
    206/* Send a reply header with default 0 length.
    207 * Return -errno on error, 0 on success. */
    208static int nbd_negotiate_send_rep(NBDClient *client, uint32_t type,
    209                                  Error **errp)
    210{
    211    return nbd_negotiate_send_rep_len(client, type, 0, errp);
    212}
    213
    214/* Send an error reply.
    215 * Return -errno on error, 0 on success. */
    216static int GCC_FMT_ATTR(4, 0)
    217nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type,
    218                            Error **errp, const char *fmt, va_list va)
    219{
    220    ERRP_GUARD();
    221    g_autofree char *msg = NULL;
    222    int ret;
    223    size_t len;
    224
    225    msg = g_strdup_vprintf(fmt, va);
    226    len = strlen(msg);
    227    assert(len < NBD_MAX_STRING_SIZE);
    228    trace_nbd_negotiate_send_rep_err(msg);
    229    ret = nbd_negotiate_send_rep_len(client, type, len, errp);
    230    if (ret < 0) {
    231        return ret;
    232    }
    233    if (nbd_write(client->ioc, msg, len, errp) < 0) {
    234        error_prepend(errp, "write failed (error message): ");
    235        return -EIO;
    236    }
    237
    238    return 0;
    239}
    240
    241/*
    242 * Return a malloc'd copy of @name suitable for use in an error reply.
    243 */
    244static char *
    245nbd_sanitize_name(const char *name)
    246{
    247    if (strnlen(name, 80) < 80) {
    248        return g_strdup(name);
    249    }
    250    /* XXX Should we also try to sanitize any control characters? */
    251    return g_strdup_printf("%.80s...", name);
    252}
    253
    254/* Send an error reply.
    255 * Return -errno on error, 0 on success. */
    256static int GCC_FMT_ATTR(4, 5)
    257nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type,
    258                           Error **errp, const char *fmt, ...)
    259{
    260    va_list va;
    261    int ret;
    262
    263    va_start(va, fmt);
    264    ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
    265    va_end(va);
    266    return ret;
    267}
    268
    269/* Drop remainder of the current option, and send a reply with the
    270 * given error type and message. Return -errno on read or write
    271 * failure; or 0 if connection is still live. */
    272static int GCC_FMT_ATTR(4, 0)
    273nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp,
    274              const char *fmt, va_list va)
    275{
    276    int ret = nbd_drop(client->ioc, client->optlen, errp);
    277
    278    client->optlen = 0;
    279    if (!ret) {
    280        ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
    281    }
    282    return ret;
    283}
    284
    285static int GCC_FMT_ATTR(4, 5)
    286nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp,
    287             const char *fmt, ...)
    288{
    289    int ret;
    290    va_list va;
    291
    292    va_start(va, fmt);
    293    ret = nbd_opt_vdrop(client, type, errp, fmt, va);
    294    va_end(va);
    295
    296    return ret;
    297}
    298
    299static int GCC_FMT_ATTR(3, 4)
    300nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...)
    301{
    302    int ret;
    303    va_list va;
    304
    305    va_start(va, fmt);
    306    ret = nbd_opt_vdrop(client, NBD_REP_ERR_INVALID, errp, fmt, va);
    307    va_end(va);
    308
    309    return ret;
    310}
    311
    312/* Read size bytes from the unparsed payload of the current option.
    313 * If @check_nul, require that no NUL bytes appear in buffer.
    314 * Return -errno on I/O error, 0 if option was completely handled by
    315 * sending a reply about inconsistent lengths, or 1 on success. */
    316static int nbd_opt_read(NBDClient *client, void *buffer, size_t size,
    317                        bool check_nul, Error **errp)
    318{
    319    if (size > client->optlen) {
    320        return nbd_opt_invalid(client, errp,
    321                               "Inconsistent lengths in option %s",
    322                               nbd_opt_lookup(client->opt));
    323    }
    324    client->optlen -= size;
    325    if (qio_channel_read_all(client->ioc, buffer, size, errp) < 0) {
    326        return -EIO;
    327    }
    328
    329    if (check_nul && strnlen(buffer, size) != size) {
    330        return nbd_opt_invalid(client, errp,
    331                               "Unexpected embedded NUL in option %s",
    332                               nbd_opt_lookup(client->opt));
    333    }
    334    return 1;
    335}
    336
    337/* Drop size bytes from the unparsed payload of the current option.
    338 * Return -errno on I/O error, 0 if option was completely handled by
    339 * sending a reply about inconsistent lengths, or 1 on success. */
    340static int nbd_opt_skip(NBDClient *client, size_t size, Error **errp)
    341{
    342    if (size > client->optlen) {
    343        return nbd_opt_invalid(client, errp,
    344                               "Inconsistent lengths in option %s",
    345                               nbd_opt_lookup(client->opt));
    346    }
    347    client->optlen -= size;
    348    return nbd_drop(client->ioc, size, errp) < 0 ? -EIO : 1;
    349}
    350
    351/* nbd_opt_read_name
    352 *
    353 * Read a string with the format:
    354 *   uint32_t len     (<= NBD_MAX_STRING_SIZE)
    355 *   len bytes string (not 0-terminated)
    356 *
    357 * On success, @name will be allocated.
    358 * If @length is non-null, it will be set to the actual string length.
    359 *
    360 * Return -errno on I/O error, 0 if option was completely handled by
    361 * sending a reply about inconsistent lengths, or 1 on success.
    362 */
    363static int nbd_opt_read_name(NBDClient *client, char **name, uint32_t *length,
    364                             Error **errp)
    365{
    366    int ret;
    367    uint32_t len;
    368    g_autofree char *local_name = NULL;
    369
    370    *name = NULL;
    371    ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
    372    if (ret <= 0) {
    373        return ret;
    374    }
    375    len = cpu_to_be32(len);
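            /* The length was read in network byte order; cpu_to_be32() is the
             * same byte swap as be32_to_cpu(), so len is now in host order. */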
    376
    377    if (len > NBD_MAX_STRING_SIZE) {
    378        return nbd_opt_invalid(client, errp,
    379                               "Invalid name length: %" PRIu32, len);
    380    }
    381
    382    local_name = g_malloc(len + 1);
    383    ret = nbd_opt_read(client, local_name, len, true, errp);
    384    if (ret <= 0) {
    385        return ret;
    386    }
    387    local_name[len] = '\0';
    388
    389    if (length) {
    390        *length = len;
    391    }
    392    *name = g_steal_pointer(&local_name);
    393
    394    return 1;
    395}
    396
    397/* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
    398 * Return -errno on error, 0 on success. */
    399static int nbd_negotiate_send_rep_list(NBDClient *client, NBDExport *exp,
    400                                       Error **errp)
    401{
    402    ERRP_GUARD();
    403    size_t name_len, desc_len;
    404    uint32_t len;
    405    const char *name = exp->name ? exp->name : "";
    406    const char *desc = exp->description ? exp->description : "";
    407    QIOChannel *ioc = client->ioc;
    408    int ret;
    409
    410    trace_nbd_negotiate_send_rep_list(name, desc);
    411    name_len = strlen(name);
    412    desc_len = strlen(desc);
    413    assert(name_len <= NBD_MAX_STRING_SIZE && desc_len <= NBD_MAX_STRING_SIZE);
    414    len = name_len + desc_len + sizeof(len);
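            /* NBD_REP_SERVER payload: 32-bit name length, the export name,
             * then the description with no length prefix of its own. */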
    415    ret = nbd_negotiate_send_rep_len(client, NBD_REP_SERVER, len, errp);
    416    if (ret < 0) {
    417        return ret;
    418    }
    419
    420    len = cpu_to_be32(name_len);
    421    if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
    422        error_prepend(errp, "write failed (name length): ");
    423        return -EINVAL;
    424    }
    425
    426    if (nbd_write(ioc, name, name_len, errp) < 0) {
    427        error_prepend(errp, "write failed (name buffer): ");
    428        return -EINVAL;
    429    }
    430
    431    if (nbd_write(ioc, desc, desc_len, errp) < 0) {
    432        error_prepend(errp, "write failed (description buffer): ");
    433        return -EINVAL;
    434    }
    435
    436    return 0;
    437}
    438
    439/* Process the NBD_OPT_LIST command, with a potential series of replies.
    440 * Return -errno on error, 0 on success. */
    441static int nbd_negotiate_handle_list(NBDClient *client, Error **errp)
    442{
    443    NBDExport *exp;
    444    assert(client->opt == NBD_OPT_LIST);
    445
    446    /* For each export, send a NBD_REP_SERVER reply. */
    447    QTAILQ_FOREACH(exp, &exports, next) {
    448        if (nbd_negotiate_send_rep_list(client, exp, errp)) {
    449            return -EINVAL;
    450        }
    451    }
    452    /* Finish with a NBD_REP_ACK. */
    453    return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    454}
    455
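        /* If the client ends up attached to a different export than the one
         * its meta contexts were negotiated against, forget those contexts. */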
    456static void nbd_check_meta_export(NBDClient *client)
    457{
    458    if (client->exp != client->export_meta.exp) {
    459        client->export_meta.count = 0;
    460    }
    461}
    462
    463/* Send a reply to NBD_OPT_EXPORT_NAME.
    464 * Return -errno on error, 0 on success. */
    465static int nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes,
    466                                            Error **errp)
    467{
    468    ERRP_GUARD();
    469    g_autofree char *name = NULL;
    470    char buf[NBD_REPLY_EXPORT_NAME_SIZE] = "";
    471    size_t len;
    472    int ret;
    473    uint16_t myflags;
    474
    475    /* Client sends:
    476        [20 ..  xx]   export name (length bytes)
    477       Server replies:
    478        [ 0 ..   7]   size
    479        [ 8 ..   9]   export flags
    480        [10 .. 133]   reserved     (0) [unless no_zeroes]
    481     */
    482    trace_nbd_negotiate_handle_export_name();
    483    if (client->optlen > NBD_MAX_STRING_SIZE) {
    484        error_setg(errp, "Bad length received");
    485        return -EINVAL;
    486    }
    487    name = g_malloc(client->optlen + 1);
    488    if (nbd_read(client->ioc, name, client->optlen, "export name", errp) < 0) {
    489        return -EIO;
    490    }
    491    name[client->optlen] = '\0';
    492    client->optlen = 0;
    493
    494    trace_nbd_negotiate_handle_export_name_request(name);
    495
    496    client->exp = nbd_export_find(name);
    497    if (!client->exp) {
    498        error_setg(errp, "export not found");
    499        return -EINVAL;
    500    }
    501
    502    myflags = client->exp->nbdflags;
    503    if (client->structured_reply) {
    504        myflags |= NBD_FLAG_SEND_DF;
    505    }
    506    trace_nbd_negotiate_new_style_size_flags(client->exp->size, myflags);
    507    stq_be_p(buf, client->exp->size);
    508    stw_be_p(buf + 8, myflags);
    509    len = no_zeroes ? 10 : sizeof(buf);
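            /* 10 = 8-byte size + 2-byte flags; the 124 reserved zero bytes
             * are omitted when the client negotiated NBD_FLAG_C_NO_ZEROES. */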
    510    ret = nbd_write(client->ioc, buf, len, errp);
    511    if (ret < 0) {
    512        error_prepend(errp, "write failed: ");
    513        return ret;
    514    }
    515
    516    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    517    blk_exp_ref(&client->exp->common);
    518    nbd_check_meta_export(client);
    519
    520    return 0;
    521}
    522
    523/* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes.
    524 * The buffer does NOT include the info type prefix.
    525 * Return -errno on error, 0 if ready to send more. */
    526static int nbd_negotiate_send_info(NBDClient *client,
    527                                   uint16_t info, uint32_t length, void *buf,
    528                                   Error **errp)
    529{
    530    int rc;
    531
    532    trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length);
    533    rc = nbd_negotiate_send_rep_len(client, NBD_REP_INFO,
    534                                    sizeof(info) + length, errp);
    535    if (rc < 0) {
    536        return rc;
    537    }
    538    info = cpu_to_be16(info);
    539    if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) {
    540        return -EIO;
    541    }
    542    if (nbd_write(client->ioc, buf, length, errp) < 0) {
    543        return -EIO;
    544    }
    545    return 0;
    546}
    547
    548/* nbd_reject_length: Handle any unexpected payload.
    549 * @fatal requests that we quit talking to the client, even if we are able
    550 * to successfully send an error reply.
    551 * Return:
    552 * -errno  transmission error occurred or @fatal was requested, errp is set
    553 * 0       error message successfully sent to client, errp is not set
    554 */
    555static int nbd_reject_length(NBDClient *client, bool fatal, Error **errp)
    556{
    557    int ret;
    558
    559    assert(client->optlen);
    560    ret = nbd_opt_invalid(client, errp, "option '%s' has unexpected length",
    561                          nbd_opt_lookup(client->opt));
    562    if (fatal && !ret) {
    563        error_setg(errp, "option '%s' has unexpected length",
    564                   nbd_opt_lookup(client->opt));
    565        return -EINVAL;
    566    }
    567    return ret;
    568}
    569
    570/* Handle NBD_OPT_INFO and NBD_OPT_GO.
    571 * Return -errno on error, 0 if ready for next option, and 1 to move
    572 * into transmission phase.  */
    573static int nbd_negotiate_handle_info(NBDClient *client, Error **errp)
    574{
    575    int rc;
    576    g_autofree char *name = NULL;
    577    NBDExport *exp;
    578    uint16_t requests;
    579    uint16_t request;
    580    uint32_t namelen = 0;
    581    bool sendname = false;
    582    bool blocksize = false;
    583    uint32_t sizes[3];
    584    char buf[sizeof(uint64_t) + sizeof(uint16_t)];
    585    uint32_t check_align = 0;
    586    uint16_t myflags;
    587
    588    /* Client sends:
    589        4 bytes: L, name length (can be 0)
    590        L bytes: export name
    591        2 bytes: N, number of requests (can be 0)
    592        N * 2 bytes: N requests
    593    */
    594    rc = nbd_opt_read_name(client, &name, &namelen, errp);
    595    if (rc <= 0) {
    596        return rc;
    597    }
    598    trace_nbd_negotiate_handle_export_name_request(name);
    599
    600    rc = nbd_opt_read(client, &requests, sizeof(requests), false, errp);
    601    if (rc <= 0) {
    602        return rc;
    603    }
    604    requests = be16_to_cpu(requests);
    605    trace_nbd_negotiate_handle_info_requests(requests);
    606    while (requests--) {
    607        rc = nbd_opt_read(client, &request, sizeof(request), false, errp);
    608        if (rc <= 0) {
    609            return rc;
    610        }
    611        request = be16_to_cpu(request);
    612        trace_nbd_negotiate_handle_info_request(request,
    613                                                nbd_info_lookup(request));
    614        /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE;
    615         * everything else is either a request we don't know or
    616         * something we send regardless of request */
    617        switch (request) {
    618        case NBD_INFO_NAME:
    619            sendname = true;
    620            break;
    621        case NBD_INFO_BLOCK_SIZE:
    622            blocksize = true;
    623            break;
    624        }
    625    }
    626    if (client->optlen) {
    627        return nbd_reject_length(client, false, errp);
    628    }
    629
    630    exp = nbd_export_find(name);
    631    if (!exp) {
    632        g_autofree char *sane_name = nbd_sanitize_name(name);
    633
    634        return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN,
    635                                          errp, "export '%s' not present",
    636                                          sane_name);
    637    }
    638
    639    /* Don't bother sending NBD_INFO_NAME unless client requested it */
    640    if (sendname) {
    641        rc = nbd_negotiate_send_info(client, NBD_INFO_NAME, namelen, name,
    642                                     errp);
    643        if (rc < 0) {
    644            return rc;
    645        }
    646    }
    647
    648    /* Send NBD_INFO_DESCRIPTION only if available, regardless of
    649     * client request */
    650    if (exp->description) {
    651        size_t len = strlen(exp->description);
    652
    653        assert(len <= NBD_MAX_STRING_SIZE);
    654        rc = nbd_negotiate_send_info(client, NBD_INFO_DESCRIPTION,
    655                                     len, exp->description, errp);
    656        if (rc < 0) {
    657            return rc;
    658        }
    659    }
    660
    661    /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
    662     * according to whether the client requested it, and according to
    663     * whether this is OPT_INFO or OPT_GO. */
    664    /* minimum - 1 for back-compat, or actual if client will obey it. */
    665    if (client->opt == NBD_OPT_INFO || blocksize) {
    666        check_align = sizes[0] = blk_get_request_alignment(exp->common.blk);
    667    } else {
    668        sizes[0] = 1;
    669    }
    670    assert(sizes[0] <= NBD_MAX_BUFFER_SIZE);
    671    /* preferred - Hard-code to 4096 for now.
    672     * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
    673    sizes[1] = MAX(4096, sizes[0]);
    674    /* maximum - At most 32M, but smaller as appropriate. */
    675    sizes[2] = MIN(blk_get_max_transfer(exp->common.blk), NBD_MAX_BUFFER_SIZE);
    676    trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]);
    677    sizes[0] = cpu_to_be32(sizes[0]);
    678    sizes[1] = cpu_to_be32(sizes[1]);
    679    sizes[2] = cpu_to_be32(sizes[2]);
    680    rc = nbd_negotiate_send_info(client, NBD_INFO_BLOCK_SIZE,
    681                                 sizeof(sizes), sizes, errp);
    682    if (rc < 0) {
    683        return rc;
    684    }
    685
    686    /* Send NBD_INFO_EXPORT always */
    687    myflags = exp->nbdflags;
    688    if (client->structured_reply) {
    689        myflags |= NBD_FLAG_SEND_DF;
    690    }
    691    trace_nbd_negotiate_new_style_size_flags(exp->size, myflags);
    692    stq_be_p(buf, exp->size);
    693    stw_be_p(buf + 8, myflags);
    694    rc = nbd_negotiate_send_info(client, NBD_INFO_EXPORT,
    695                                 sizeof(buf), buf, errp);
    696    if (rc < 0) {
    697        return rc;
    698    }
    699
    700    /*
    701     * If the client is just asking for NBD_OPT_INFO, but forgot to
    702     * request block sizes in a situation that would impact
    703     * performance, then return an error. But for NBD_OPT_GO, we
    704     * tolerate all clients, regardless of alignments.
    705     */
    706    if (client->opt == NBD_OPT_INFO && !blocksize &&
    707        blk_get_request_alignment(exp->common.blk) > 1) {
    708        return nbd_negotiate_send_rep_err(client,
    709                                          NBD_REP_ERR_BLOCK_SIZE_REQD,
    710                                          errp,
    711                                          "request NBD_INFO_BLOCK_SIZE to "
    712                                          "use this export");
    713    }
    714
    715    /* Final reply */
    716    rc = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    717    if (rc < 0) {
    718        return rc;
    719    }
    720
    721    if (client->opt == NBD_OPT_GO) {
    722        client->exp = exp;
    723        client->check_align = check_align;
    724        QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    725        blk_exp_ref(&client->exp->common);
    726        nbd_check_meta_export(client);
    727        rc = 1;
    728    }
    729    return rc;
    730}
    731
    732
    733/* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
    734 * new channel for all further (now-encrypted) communication. */
    735static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
    736                                                 Error **errp)
    737{
    738    QIOChannel *ioc;
    739    QIOChannelTLS *tioc;
    740    struct NBDTLSHandshakeData data = { 0 };
    741
    742    assert(client->opt == NBD_OPT_STARTTLS);
    743
    744    trace_nbd_negotiate_handle_starttls();
    745    ioc = client->ioc;
    746
    747    if (nbd_negotiate_send_rep(client, NBD_REP_ACK, errp) < 0) {
    748        return NULL;
    749    }
    750
    751    tioc = qio_channel_tls_new_server(ioc,
    752                                      client->tlscreds,
    753                                      client->tlsauthz,
    754                                      errp);
    755    if (!tioc) {
    756        return NULL;
    757    }
    758
    759    qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
    760    trace_nbd_negotiate_handle_starttls_handshake();
    761    data.loop = g_main_loop_new(g_main_context_default(), FALSE);
    762    qio_channel_tls_handshake(tioc,
    763                              nbd_tls_handshake,
    764                              &data,
    765                              NULL,
    766                              NULL);
    767
    768    if (!data.complete) {
    769        g_main_loop_run(data.loop);
    770    }
    771    g_main_loop_unref(data.loop);
    772    if (data.error) {
    773        object_unref(OBJECT(tioc));
    774        error_propagate(errp, data.error);
    775        return NULL;
    776    }
    777
    778    return QIO_CHANNEL(tioc);
    779}
    780
    781/* nbd_negotiate_send_meta_context
    782 *
    783 * Send one chunk of reply to NBD_OPT_{LIST,SET}_META_CONTEXT
    784 *
    785 * For NBD_OPT_LIST_META_CONTEXT @context_id is ignored, 0 is used instead.
    786 */
    787static int nbd_negotiate_send_meta_context(NBDClient *client,
    788                                           const char *context,
    789                                           uint32_t context_id,
    790                                           Error **errp)
    791{
    792    NBDOptionReplyMetaContext opt;
    793    struct iovec iov[] = {
    794        {.iov_base = &opt, .iov_len = sizeof(opt)},
    795        {.iov_base = (void *)context, .iov_len = strlen(context)}
    796    };
    797
    798    assert(iov[1].iov_len <= NBD_MAX_STRING_SIZE);
    799    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
    800        context_id = 0;
    801    }
    802
    803    trace_nbd_negotiate_meta_query_reply(context, context_id);
    804    set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
    805                      sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
    806    stl_be_p(&opt.context_id, context_id);
    807
    808    return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
    809}
    810
    811/*
    812 * Return true if @query matches @pattern, or if @query is empty when
    813 * the @client is performing _LIST_.
    814 */
    815static bool nbd_meta_empty_or_pattern(NBDClient *client, const char *pattern,
    816                                      const char *query)
    817{
    818    if (!*query) {
    819        trace_nbd_negotiate_meta_query_parse("empty");
    820        return client->opt == NBD_OPT_LIST_META_CONTEXT;
    821    }
    822    if (strcmp(query, pattern) == 0) {
    823        trace_nbd_negotiate_meta_query_parse(pattern);
    824        return true;
    825    }
    826    trace_nbd_negotiate_meta_query_skip("pattern not matched");
    827    return false;
    828}
    829
    830/*
    831 * Return true and adjust @str in place if it begins with @prefix.
    832 */
    833static bool nbd_strshift(const char **str, const char *prefix)
    834{
    835    size_t len = strlen(prefix);
    836
    837    if (strncmp(*str, prefix, len) == 0) {
    838        *str += len;
    839        return true;
    840    }
    841    return false;
    842}
    843
    844/* nbd_meta_base_query
    845 *
    846 * Handle queries to 'base' namespace. For now, only the base:allocation
    847 * context is available.  Return true if @query has been handled.
    848 */
    849static bool nbd_meta_base_query(NBDClient *client, NBDExportMetaContexts *meta,
    850                                const char *query)
    851{
    852    if (!nbd_strshift(&query, "base:")) {
    853        return false;
    854    }
    855    trace_nbd_negotiate_meta_query_parse("base:");
    856
    857    if (nbd_meta_empty_or_pattern(client, "allocation", query)) {
    858        meta->base_allocation = true;
    859    }
    860    return true;
    861}
    862
    863/* nbd_meta_qemu_query
    864 *
    865 * Handle queries to 'qemu' namespace. For now, only the qemu:dirty-bitmap:
    866 * and qemu:allocation-depth contexts are available.  Return true if @query
    867 * has been handled.
    868 */
    869static bool nbd_meta_qemu_query(NBDClient *client, NBDExportMetaContexts *meta,
    870                                const char *query)
    871{
    872    size_t i;
    873
    874    if (!nbd_strshift(&query, "qemu:")) {
    875        return false;
    876    }
    877    trace_nbd_negotiate_meta_query_parse("qemu:");
    878
    879    if (!*query) {
    880        if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
    881            meta->allocation_depth = meta->exp->allocation_depth;
    882            memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
    883        }
    884        trace_nbd_negotiate_meta_query_parse("empty");
    885        return true;
    886    }
    887
    888    if (strcmp(query, "allocation-depth") == 0) {
    889        trace_nbd_negotiate_meta_query_parse("allocation-depth");
    890        meta->allocation_depth = meta->exp->allocation_depth;
    891        return true;
    892    }
    893
    894    if (nbd_strshift(&query, "dirty-bitmap:")) {
    895        trace_nbd_negotiate_meta_query_parse("dirty-bitmap:");
    896        if (!*query) {
    897            if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
    898                memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
    899            }
    900            trace_nbd_negotiate_meta_query_parse("empty");
    901            return true;
    902        }
    903
    904        for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
    905            const char *bm_name;
    906
    907            bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
    908            if (strcmp(bm_name, query) == 0) {
    909                meta->bitmaps[i] = true;
    910                trace_nbd_negotiate_meta_query_parse(query);
    911                return true;
    912            }
    913        }
    914        trace_nbd_negotiate_meta_query_skip("no dirty-bitmap match");
    915        return true;
    916    }
    917
    918    trace_nbd_negotiate_meta_query_skip("unknown qemu context");
    919    return true;
    920}
    921
    922/* nbd_negotiate_meta_query
    923 *
    924 * Parse namespace name and call corresponding function to parse body of the
    925 * query.
    926 *
    927 * The only supported namespaces are 'base' and 'qemu'.
    928 *
    929 * Return -errno on I/O error, 0 if option was completely handled by
    930 * sending a reply about inconsistent lengths, or 1 on success. */
    931static int nbd_negotiate_meta_query(NBDClient *client,
    932                                    NBDExportMetaContexts *meta, Error **errp)
    933{
    934    int ret;
    935    g_autofree char *query = NULL;
    936    uint32_t len;
    937
    938    ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
    939    if (ret <= 0) {
    940        return ret;
    941    }
    942    len = cpu_to_be32(len);
    943
    944    if (len > NBD_MAX_STRING_SIZE) {
    945        trace_nbd_negotiate_meta_query_skip("length too long");
    946        return nbd_opt_skip(client, len, errp);
    947    }
    948
    949    query = g_malloc(len + 1);
    950    ret = nbd_opt_read(client, query, len, true, errp);
    951    if (ret <= 0) {
    952        return ret;
    953    }
    954    query[len] = '\0';
    955
    956    if (nbd_meta_base_query(client, meta, query)) {
    957        return 1;
    958    }
    959    if (nbd_meta_qemu_query(client, meta, query)) {
    960        return 1;
    961    }
    962
    963    trace_nbd_negotiate_meta_query_skip("unknown namespace");
    964    return 1;
    965}
    966
    967/* nbd_negotiate_meta_queries
    968 * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
    969 *
    970 * Return -errno on I/O error, or 0 if option was completely handled. */
    971static int nbd_negotiate_meta_queries(NBDClient *client,
    972                                      NBDExportMetaContexts *meta, Error **errp)
    973{
    974    int ret;
    975    g_autofree char *export_name = NULL;
    976    /* Mark unused to work around https://bugs.llvm.org/show_bug.cgi?id=3888 */
    977    g_autofree G_GNUC_UNUSED bool *bitmaps = NULL;
    978    NBDExportMetaContexts local_meta = {0};
    979    uint32_t nb_queries;
    980    size_t i;
    981    size_t count = 0;
    982
    983    if (client->opt == NBD_OPT_SET_META_CONTEXT && !client->structured_reply) {
    984        return nbd_opt_invalid(client, errp,
    985                               "request option '%s' when structured reply "
    986                               "is not negotiated",
    987                               nbd_opt_lookup(client->opt));
    988    }
    989
    990    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
    991        /* Only change the caller's meta on SET. */
    992        meta = &local_meta;
    993    }
    994
    995    g_free(meta->bitmaps);
    996    memset(meta, 0, sizeof(*meta));
    997
    998    ret = nbd_opt_read_name(client, &export_name, NULL, errp);
    999    if (ret <= 0) {
   1000        return ret;
   1001    }
   1002
   1003    meta->exp = nbd_export_find(export_name);
   1004    if (meta->exp == NULL) {
   1005        g_autofree char *sane_name = nbd_sanitize_name(export_name);
   1006
   1007        return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp,
   1008                            "export '%s' not present", sane_name);
   1009    }
   1010    meta->bitmaps = g_new0(bool, meta->exp->nr_export_bitmaps);
   1011    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
   1012        bitmaps = meta->bitmaps;
   1013    }
   1014
   1015    ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), false, errp);
   1016    if (ret <= 0) {
   1017        return ret;
   1018    }
   1019    nb_queries = cpu_to_be32(nb_queries);
   1020    trace_nbd_negotiate_meta_context(nbd_opt_lookup(client->opt),
   1021                                     export_name, nb_queries);
   1022
   1023    if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) {
   1024        /* enable all known contexts */
   1025        meta->base_allocation = true;
   1026        meta->allocation_depth = meta->exp->allocation_depth;
   1027        memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
   1028    } else {
   1029        for (i = 0; i < nb_queries; ++i) {
   1030            ret = nbd_negotiate_meta_query(client, meta, errp);
   1031            if (ret <= 0) {
   1032                return ret;
   1033            }
   1034        }
   1035    }
   1036
   1037    if (meta->base_allocation) {
   1038        ret = nbd_negotiate_send_meta_context(client, "base:allocation",
   1039                                              NBD_META_ID_BASE_ALLOCATION,
   1040                                              errp);
   1041        if (ret < 0) {
   1042            return ret;
   1043        }
   1044        count++;
   1045    }
   1046
   1047    if (meta->allocation_depth) {
   1048        ret = nbd_negotiate_send_meta_context(client, "qemu:allocation-depth",
   1049                                              NBD_META_ID_ALLOCATION_DEPTH,
   1050                                              errp);
   1051        if (ret < 0) {
   1052            return ret;
   1053        }
   1054        count++;
   1055    }
   1056
   1057    for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
   1058        const char *bm_name;
   1059        g_autofree char *context = NULL;
   1060
   1061        if (!meta->bitmaps[i]) {
   1062            continue;
   1063        }
   1064
   1065        bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
   1066        context = g_strdup_printf("qemu:dirty-bitmap:%s", bm_name);
   1067
   1068        ret = nbd_negotiate_send_meta_context(client, context,
   1069                                              NBD_META_ID_DIRTY_BITMAP + i,
   1070                                              errp);
   1071        if (ret < 0) {
   1072            return ret;
   1073        }
   1074        count++;
   1075    }
   1076
   1077    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
   1078    if (ret == 0) {
   1079        meta->count = count;
   1080    }
   1081
   1082    return ret;
   1083}
   1084
   1085/* nbd_negotiate_options
   1086 * Process all NBD_OPT_* client option commands, during fixed newstyle
   1087 * negotiation.
   1088 * Return:
   1089 * -errno  on error, errp is set
   1090 * 0       on successful negotiation, errp is not set
   1091 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
   1092 *         errp is not set
   1093 */
   1094static int nbd_negotiate_options(NBDClient *client, Error **errp)
   1095{
   1096    uint32_t flags;
   1097    bool fixedNewstyle = false;
   1098    bool no_zeroes = false;
   1099
   1100    /* Client sends:
   1101        [ 0 ..   3]   client flags
   1102
   1103       Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
   1104        [ 0 ..   7]   NBD_OPTS_MAGIC
   1105        [ 8 ..  11]   NBD option
   1106        [12 ..  15]   Data length
   1107        ...           Rest of request
   1108
   1109        [ 0 ..   7]   NBD_OPTS_MAGIC
   1110        [ 8 ..  11]   Second NBD option
   1111        [12 ..  15]   Data length
   1112        ...           Rest of request
   1113    */
   1114
   1115    if (nbd_read32(client->ioc, &flags, "flags", errp) < 0) {
   1116        return -EIO;
   1117    }
   1118    trace_nbd_negotiate_options_flags(flags);
   1119    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
   1120        fixedNewstyle = true;
   1121        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
   1122    }
   1123    if (flags & NBD_FLAG_C_NO_ZEROES) {
   1124        no_zeroes = true;
   1125        flags &= ~NBD_FLAG_C_NO_ZEROES;
   1126    }
   1127    if (flags != 0) {
   1128        error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
   1129        return -EINVAL;
   1130    }
   1131
   1132    while (1) {
   1133        int ret;
   1134        uint32_t option, length;
   1135        uint64_t magic;
   1136
   1137        if (nbd_read64(client->ioc, &magic, "opts magic", errp) < 0) {
   1138            return -EINVAL;
   1139        }
   1140        trace_nbd_negotiate_options_check_magic(magic);
   1141        if (magic != NBD_OPTS_MAGIC) {
   1142            error_setg(errp, "Bad magic received");
   1143            return -EINVAL;
   1144        }
   1145
   1146        if (nbd_read32(client->ioc, &option, "option", errp) < 0) {
   1147            return -EINVAL;
   1148        }
   1149        client->opt = option;
   1150
   1151        if (nbd_read32(client->ioc, &length, "option length", errp) < 0) {
   1152            return -EINVAL;
   1153        }
   1154        assert(!client->optlen);
   1155        client->optlen = length;
   1156
   1157        if (length > NBD_MAX_BUFFER_SIZE) {
   1158            error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)",
   1159                       length, NBD_MAX_BUFFER_SIZE);
   1160            return -EINVAL;
   1161        }
   1162
   1163        trace_nbd_negotiate_options_check_option(option,
   1164                                                 nbd_opt_lookup(option));
   1165        if (client->tlscreds &&
   1166            client->ioc == (QIOChannel *)client->sioc) {
   1167            QIOChannel *tioc;
   1168            if (!fixedNewstyle) {
   1169                error_setg(errp, "Unsupported option 0x%" PRIx32, option);
   1170                return -EINVAL;
   1171            }
   1172            switch (option) {
   1173            case NBD_OPT_STARTTLS:
   1174                if (length) {
   1175                    /* Unconditionally drop the connection if the client
   1176                     * can't start a TLS negotiation correctly */
   1177                    return nbd_reject_length(client, true, errp);
   1178                }
   1179                tioc = nbd_negotiate_handle_starttls(client, errp);
   1180                if (!tioc) {
   1181                    return -EIO;
   1182                }
   1183                ret = 0;
   1184                object_unref(OBJECT(client->ioc));
   1185                client->ioc = QIO_CHANNEL(tioc);
   1186                break;
   1187
   1188            case NBD_OPT_EXPORT_NAME:
   1189                /* No way to return an error to client, so drop connection */
   1190                error_setg(errp, "Option 0x%x not permitted before TLS",
   1191                           option);
   1192                return -EINVAL;
   1193
   1194            default:
   1195                /* Let the client keep trying, unless they asked to
   1196                 * quit. Always try to give an error back to the
   1197                 * client; but when replying to OPT_ABORT, be aware
   1198                 * that the client may hang up before receiving the
   1199                 * error, in which case we are fine ignoring the
   1200                 * resulting EPIPE. */
   1201                ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD,
   1202                                   option == NBD_OPT_ABORT ? NULL : errp,
   1203                                   "Option 0x%" PRIx32
   1204                                   " not permitted before TLS", option);
   1205                if (option == NBD_OPT_ABORT) {
   1206                    return 1;
   1207                }
   1208                break;
   1209            }
   1210        } else if (fixedNewstyle) {
   1211            switch (option) {
   1212            case NBD_OPT_LIST:
   1213                if (length) {
   1214                    ret = nbd_reject_length(client, false, errp);
   1215                } else {
   1216                    ret = nbd_negotiate_handle_list(client, errp);
   1217                }
   1218                break;
   1219
   1220            case NBD_OPT_ABORT:
   1221                /* NBD spec says we must try to reply before
   1222                 * disconnecting, but that we must also tolerate
   1223                 * guests that don't wait for our reply. */
   1224                nbd_negotiate_send_rep(client, NBD_REP_ACK, NULL);
   1225                return 1;
   1226
   1227            case NBD_OPT_EXPORT_NAME:
   1228                return nbd_negotiate_handle_export_name(client, no_zeroes,
   1229                                                        errp);
   1230
   1231            case NBD_OPT_INFO:
   1232            case NBD_OPT_GO:
   1233                ret = nbd_negotiate_handle_info(client, errp);
   1234                if (ret == 1) {
   1235                    assert(option == NBD_OPT_GO);
   1236                    return 0;
   1237                }
   1238                break;
   1239
   1240            case NBD_OPT_STARTTLS:
   1241                if (length) {
   1242                    ret = nbd_reject_length(client, false, errp);
   1243                } else if (client->tlscreds) {
   1244                    ret = nbd_negotiate_send_rep_err(client,
   1245                                                     NBD_REP_ERR_INVALID, errp,
   1246                                                     "TLS already enabled");
   1247                } else {
   1248                    ret = nbd_negotiate_send_rep_err(client,
   1249                                                     NBD_REP_ERR_POLICY, errp,
   1250                                                     "TLS not configured");
   1251                }
   1252                break;
   1253
   1254            case NBD_OPT_STRUCTURED_REPLY:
   1255                if (length) {
   1256                    ret = nbd_reject_length(client, false, errp);
   1257                } else if (client->structured_reply) {
   1258                    ret = nbd_negotiate_send_rep_err(
   1259                        client, NBD_REP_ERR_INVALID, errp,
   1260                        "structured reply already negotiated");
   1261                } else {
   1262                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
   1263                    client->structured_reply = true;
   1264                }
   1265                break;
   1266
   1267            case NBD_OPT_LIST_META_CONTEXT:
   1268            case NBD_OPT_SET_META_CONTEXT:
   1269                ret = nbd_negotiate_meta_queries(client, &client->export_meta,
   1270                                                 errp);
   1271                break;
   1272
   1273            default:
   1274                ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
   1275                                   "Unsupported option %" PRIu32 " (%s)",
   1276                                   option, nbd_opt_lookup(option));
   1277                break;
   1278            }
   1279        } else {
   1280            /*
   1281             * If broken new-style we should drop the connection
   1282             * for anything except NBD_OPT_EXPORT_NAME
   1283             */
   1284            switch (option) {
   1285            case NBD_OPT_EXPORT_NAME:
   1286                return nbd_negotiate_handle_export_name(client, no_zeroes,
   1287                                                        errp);
   1288
   1289            default:
   1290                error_setg(errp, "Unsupported option %" PRIu32 " (%s)",
   1291                           option, nbd_opt_lookup(option));
   1292                return -EINVAL;
   1293            }
   1294        }
   1295        if (ret < 0) {
   1296            return ret;
   1297        }
   1298    }
   1299}
   1300
   1301/* nbd_negotiate
   1302 * Return:
   1303 * -errno  on error, errp is set
   1304 * 0       on successful negotiation, errp is not set
   1305 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
   1306 *         errp is not set
   1307 */
   1308static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
   1309{
   1310    ERRP_GUARD();
   1311    char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
   1312    int ret;
   1313
   1314    /* Old style negotiation header, no room for options
   1315        [ 0 ..   7]   passwd       ("NBDMAGIC")
   1316        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
   1317        [16 ..  23]   size
   1318        [24 ..  27]   export flags (zero-extended)
   1319        [28 .. 151]   reserved     (0)
   1320
   1321       New style negotiation header, client can send options
   1322        [ 0 ..   7]   passwd       ("NBDMAGIC")
   1323        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
   1324        [16 ..  17]   server flags (0)
   1325        ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
   1326     */
   1327
   1328    qio_channel_set_blocking(client->ioc, false, NULL);
   1329
   1330    trace_nbd_negotiate_begin();
   1331    memcpy(buf, "NBDMAGIC", 8);
   1332
   1333    stq_be_p(buf + 8, NBD_OPTS_MAGIC);
   1334    stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);
   1335
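            /* Only the first 18 bytes are sent: the 8-byte "NBDMAGIC" passwd,
             * the 8-byte NBD_OPTS_MAGIC, and the 2-byte handshake flags. */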
   1336    if (nbd_write(client->ioc, buf, 18, errp) < 0) {
   1337        error_prepend(errp, "write failed: ");
   1338        return -EINVAL;
   1339    }
   1340    ret = nbd_negotiate_options(client, errp);
   1341    if (ret != 0) {
   1342        if (ret < 0) {
   1343            error_prepend(errp, "option negotiation failed: ");
   1344        }
   1345        return ret;
   1346    }
   1347
   1348    /* Attach the channel to the same AioContext as the export */
   1349    if (client->exp && client->exp->common.ctx) {
   1350        qio_channel_attach_aio_context(client->ioc, client->exp->common.ctx);
   1351    }
   1352
   1353    assert(!client->optlen);
   1354    trace_nbd_negotiate_success();
   1355
   1356    return 0;
   1357}
   1358
   1359/* nbd_read_eof
   1360 * Tries to read @size bytes from @ioc. This is a local implementation of
   1361 * qio_channel_readv_all_eof. We have it here because we need it to be
   1362 * interruptible and to know when the coroutine is yielding.
   1363 * Returns 1 on success
   1364 *         0 on eof, when no data was read (errp is not set)
   1365 *         negative errno on failure (errp is set)
   1366 */
   1367static inline int coroutine_fn
   1368nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp)
   1369{
   1370    bool partial = false;
   1371
   1372    assert(size);
   1373    while (size > 0) {
   1374        struct iovec iov = { .iov_base = buffer, .iov_len = size };
   1375        ssize_t len;
   1376
   1377        len = qio_channel_readv(client->ioc, &iov, 1, errp);
   1378        if (len == QIO_CHANNEL_ERR_BLOCK) {
   1379            client->read_yielding = true;
   1380            qio_channel_yield(client->ioc, G_IO_IN);
   1381            client->read_yielding = false;
   1382            if (client->quiescing) {
   1383                return -EAGAIN;
   1384            }
   1385            continue;
   1386        } else if (len < 0) {
   1387            return -EIO;
   1388        } else if (len == 0) {
   1389            if (partial) {
   1390                error_setg(errp,
   1391                           "Unexpected end-of-file before all bytes were read");
   1392                return -EIO;
   1393            } else {
   1394                return 0;
   1395            }
   1396        }
   1397
   1398        partial = true;
   1399        size -= len;
   1400        buffer = (uint8_t *) buffer + len;
   1401    }
   1402    return 1;
   1403}
   1404
   1405static int nbd_receive_request(NBDClient *client, NBDRequest *request,
   1406                               Error **errp)
   1407{
   1408    uint8_t buf[NBD_REQUEST_SIZE];
   1409    uint32_t magic;
   1410    int ret;
   1411
   1412    ret = nbd_read_eof(client, buf, sizeof(buf), errp);
   1413    if (ret < 0) {
   1414        return ret;
   1415    }
   1416
   1417    /* Request
   1418       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
   1419       [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
   1420       [ 6 ..  7]   type    (NBD_CMD_READ, ...)
   1421       [ 8 .. 15]   handle
   1422       [16 .. 23]   from
   1423       [24 .. 27]   len
   1424     */
   1425
   1426    magic = ldl_be_p(buf);
   1427    request->flags  = lduw_be_p(buf + 4);
   1428    request->type   = lduw_be_p(buf + 6);
   1429    request->handle = ldq_be_p(buf + 8);
   1430    request->from   = ldq_be_p(buf + 16);
   1431    request->len    = ldl_be_p(buf + 24);
   1432
   1433    trace_nbd_receive_request(magic, request->flags, request->type,
   1434                              request->from, request->len);
   1435
   1436    if (magic != NBD_REQUEST_MAGIC) {
   1437        error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic);
   1438        return -EINVAL;
   1439    }
   1440    return 0;
   1441}
   1442
   1443#define MAX_NBD_REQUESTS 16
   1444
   1445void nbd_client_get(NBDClient *client)
   1446{
   1447    client->refcount++;
   1448}
   1449
   1450void nbd_client_put(NBDClient *client)
   1451{
   1452    if (--client->refcount == 0) {
   1453        /* The last reference should be dropped by client->close,
   1454         * which is called by client_close.
   1455         */
   1456        assert(client->closing);
   1457
   1458        qio_channel_detach_aio_context(client->ioc);
   1459        object_unref(OBJECT(client->sioc));
   1460        object_unref(OBJECT(client->ioc));
   1461        if (client->tlscreds) {
   1462            object_unref(OBJECT(client->tlscreds));
   1463        }
   1464        g_free(client->tlsauthz);
   1465        if (client->exp) {
   1466            QTAILQ_REMOVE(&client->exp->clients, client, next);
   1467            blk_exp_unref(&client->exp->common);
   1468        }
   1469        g_free(client->export_meta.bitmaps);
   1470        g_free(client);
   1471    }
   1472}
   1473
   1474static void client_close(NBDClient *client, bool negotiated)
   1475{
   1476    if (client->closing) {
   1477        return;
   1478    }
   1479
   1480    client->closing = true;
   1481
   1482    /* Force requests to finish.  They will drop their own references,
   1483     * then we'll close the socket and free the NBDClient.
   1484     */
   1485    qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
   1486                         NULL);
   1487
   1488    /* Also tell the client, so that they release their reference.  */
   1489    if (client->close_fn) {
   1490        client->close_fn(client, negotiated);
   1491    }
   1492}
   1493
   1494static NBDRequestData *nbd_request_get(NBDClient *client)
   1495{
   1496    NBDRequestData *req;
   1497
   1498    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
   1499    client->nb_requests++;
   1500
   1501    req = g_new0(NBDRequestData, 1);
   1502    nbd_client_get(client);
   1503    req->client = client;
   1504    return req;
   1505}
   1506
   1507static void nbd_request_put(NBDRequestData *req)
   1508{
   1509    NBDClient *client = req->client;
   1510
   1511    if (req->data) {
   1512        qemu_vfree(req->data);
   1513    }
   1514    g_free(req);
   1515
   1516    client->nb_requests--;
   1517
   1518    if (client->quiescing && client->nb_requests == 0) {
   1519        aio_wait_kick();
   1520    }
   1521
   1522    nbd_client_receive_next_request(client);
   1523
   1524    nbd_client_put(client);
   1525}
   1526
   1527static void blk_aio_attached(AioContext *ctx, void *opaque)
   1528{
   1529    NBDExport *exp = opaque;
   1530    NBDClient *client;
   1531
   1532    trace_nbd_blk_aio_attached(exp->name, ctx);
   1533
   1534    exp->common.ctx = ctx;
   1535
   1536    QTAILQ_FOREACH(client, &exp->clients, next) {
   1537        qio_channel_attach_aio_context(client->ioc, ctx);
   1538
   1539        assert(client->nb_requests == 0);
   1540        assert(client->recv_coroutine == NULL);
   1541        assert(client->send_coroutine == NULL);
   1542    }
   1543}
   1544
   1545static void blk_aio_detach(void *opaque)
   1546{
   1547    NBDExport *exp = opaque;
   1548    NBDClient *client;
   1549
   1550    trace_nbd_blk_aio_detach(exp->name, exp->common.ctx);
   1551
   1552    QTAILQ_FOREACH(client, &exp->clients, next) {
   1553        qio_channel_detach_aio_context(client->ioc);
   1554    }
   1555
   1556    exp->common.ctx = NULL;
   1557}
   1558
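        /*
         * The drain hooks installed below via nbd_block_ops work together:
         * nbd_drained_begin() marks every client as quiescing so no new
         * request is received, nbd_drained_poll() keeps the drain waiting as
         * long as requests are still in flight (and wakes a reader parked in
         * nbd_read_eof() so it notices the quiesce), and nbd_drained_end()
         * clears the flag and restarts request reception.
         */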
   1559static void nbd_drained_begin(void *opaque)
   1560{
   1561    NBDExport *exp = opaque;
   1562    NBDClient *client;
   1563
   1564    QTAILQ_FOREACH(client, &exp->clients, next) {
   1565        client->quiescing = true;
   1566    }
   1567}
   1568
   1569static void nbd_drained_end(void *opaque)
   1570{
   1571    NBDExport *exp = opaque;
   1572    NBDClient *client;
   1573
   1574    QTAILQ_FOREACH(client, &exp->clients, next) {
   1575        client->quiescing = false;
   1576        nbd_client_receive_next_request(client);
   1577    }
   1578}
   1579
   1580static bool nbd_drained_poll(void *opaque)
   1581{
   1582    NBDExport *exp = opaque;
   1583    NBDClient *client;
   1584
   1585    QTAILQ_FOREACH(client, &exp->clients, next) {
   1586        if (client->nb_requests != 0) {
   1587            /*
    1588             * If there's a coroutine waiting for a request in nbd_read_eof(),
   1589             * enter it here so we don't depend on the client to wake it up.
   1590             */
   1591            if (client->recv_coroutine != NULL && client->read_yielding) {
   1592                qemu_aio_coroutine_enter(exp->common.ctx,
   1593                                         client->recv_coroutine);
   1594            }
   1595
   1596            return true;
   1597        }
   1598    }
   1599
   1600    return false;
   1601}
   1602
   1603static void nbd_eject_notifier(Notifier *n, void *data)
   1604{
   1605    NBDExport *exp = container_of(n, NBDExport, eject_notifier);
   1606
   1607    blk_exp_request_shutdown(&exp->common);
   1608}
   1609
   1610void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk)
   1611{
   1612    NBDExport *nbd_exp = container_of(exp, NBDExport, common);
   1613    assert(exp->drv == &blk_exp_nbd);
   1614    assert(nbd_exp->eject_notifier_blk == NULL);
   1615
   1616    blk_ref(blk);
   1617    nbd_exp->eject_notifier_blk = blk;
   1618    nbd_exp->eject_notifier.notify = nbd_eject_notifier;
   1619    blk_add_remove_bs_notifier(blk, &nbd_exp->eject_notifier);
   1620}
   1621
   1622static const BlockDevOps nbd_block_ops = {
   1623    .drained_begin = nbd_drained_begin,
   1624    .drained_end = nbd_drained_end,
   1625    .drained_poll = nbd_drained_poll,
   1626};
   1627
   1628static int nbd_export_create(BlockExport *blk_exp, BlockExportOptions *exp_args,
   1629                             Error **errp)
   1630{
   1631    NBDExport *exp = container_of(blk_exp, NBDExport, common);
   1632    BlockExportOptionsNbd *arg = &exp_args->u.nbd;
   1633    BlockBackend *blk = blk_exp->blk;
   1634    int64_t size;
   1635    uint64_t perm, shared_perm;
   1636    bool readonly = !exp_args->writable;
   1637    bool shared = !exp_args->writable;
   1638    strList *bitmaps;
   1639    size_t i;
   1640    int ret;
   1641
   1642    assert(exp_args->type == BLOCK_EXPORT_TYPE_NBD);
   1643
   1644    if (!nbd_server_is_running()) {
   1645        error_setg(errp, "NBD server not running");
   1646        return -EINVAL;
   1647    }
   1648
   1649    if (!arg->has_name) {
   1650        arg->name = exp_args->node_name;
   1651    }
   1652
   1653    if (strlen(arg->name) > NBD_MAX_STRING_SIZE) {
   1654        error_setg(errp, "export name '%s' too long", arg->name);
   1655        return -EINVAL;
   1656    }
   1657
   1658    if (arg->description && strlen(arg->description) > NBD_MAX_STRING_SIZE) {
   1659        error_setg(errp, "description '%s' too long", arg->description);
   1660        return -EINVAL;
   1661    }
   1662
   1663    if (nbd_export_find(arg->name)) {
   1664        error_setg(errp, "NBD server already has export named '%s'", arg->name);
   1665        return -EEXIST;
   1666    }
   1667
   1668    size = blk_getlength(blk);
   1669    if (size < 0) {
   1670        error_setg_errno(errp, -size,
   1671                         "Failed to determine the NBD export's length");
   1672        return size;
   1673    }
   1674
    1675    /* Don't allow resize while the NBD server is running; beyond that, we
    1676     * don't care what happens to the node. */
   1677    blk_get_perm(blk, &perm, &shared_perm);
   1678    ret = blk_set_perm(blk, perm, shared_perm & ~BLK_PERM_RESIZE, errp);
   1679    if (ret < 0) {
   1680        return ret;
   1681    }
   1682
   1683    QTAILQ_INIT(&exp->clients);
   1684    exp->name = g_strdup(arg->name);
   1685    exp->description = g_strdup(arg->description);
   1686    exp->nbdflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH |
   1687                     NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_CACHE);
   1688    if (readonly) {
   1689        exp->nbdflags |= NBD_FLAG_READ_ONLY;
   1690        if (shared) {
   1691            exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
   1692        }
   1693    } else {
   1694        exp->nbdflags |= (NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_WRITE_ZEROES |
   1695                          NBD_FLAG_SEND_FAST_ZERO);
   1696    }
   1697    exp->size = QEMU_ALIGN_DOWN(size, BDRV_SECTOR_SIZE);
   1698
   1699    for (bitmaps = arg->bitmaps; bitmaps; bitmaps = bitmaps->next) {
   1700        exp->nr_export_bitmaps++;
   1701    }
   1702    exp->export_bitmaps = g_new0(BdrvDirtyBitmap *, exp->nr_export_bitmaps);
   1703    for (i = 0, bitmaps = arg->bitmaps; bitmaps;
   1704         i++, bitmaps = bitmaps->next) {
   1705        const char *bitmap = bitmaps->value;
   1706        BlockDriverState *bs = blk_bs(blk);
   1707        BdrvDirtyBitmap *bm = NULL;
   1708
   1709        while (bs) {
   1710            bm = bdrv_find_dirty_bitmap(bs, bitmap);
   1711            if (bm != NULL) {
   1712                break;
   1713            }
   1714
   1715            bs = bdrv_filter_or_cow_bs(bs);
   1716        }
   1717
   1718        if (bm == NULL) {
   1719            ret = -ENOENT;
    1720            error_setg(errp, "Bitmap '%s' was not found", bitmap);
   1721            goto fail;
   1722        }
   1723
   1724        if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
   1725            ret = -EINVAL;
   1726            goto fail;
   1727        }
   1728
   1729        if (readonly && bdrv_is_writable(bs) &&
   1730            bdrv_dirty_bitmap_enabled(bm)) {
   1731            ret = -EINVAL;
   1732            error_setg(errp,
   1733                       "Enabled bitmap '%s' incompatible with readonly export",
   1734                       bitmap);
   1735            goto fail;
   1736        }
   1737
   1738        exp->export_bitmaps[i] = bm;
   1739        assert(strlen(bitmap) <= BDRV_BITMAP_MAX_NAME_SIZE);
   1740    }
   1741
   1742    /* Mark bitmaps busy in a separate loop, to simplify roll-back concerns. */
   1743    for (i = 0; i < exp->nr_export_bitmaps; i++) {
   1744        bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], true);
   1745    }
   1746
   1747    exp->allocation_depth = arg->allocation_depth;
   1748
   1749    /*
   1750     * We need to inhibit request queuing in the block layer to ensure we can
   1751     * be properly quiesced when entering a drained section, as our coroutines
   1752     * servicing pending requests might enter blk_pread().
   1753     */
   1754    blk_set_disable_request_queuing(blk, true);
   1755
   1756    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
   1757
   1758    blk_set_dev_ops(blk, &nbd_block_ops, exp);
   1759
   1760    QTAILQ_INSERT_TAIL(&exports, exp, next);
   1761
   1762    return 0;
   1763
   1764fail:
   1765    g_free(exp->export_bitmaps);
   1766    g_free(exp->name);
   1767    g_free(exp->description);
   1768    return ret;
   1769}
   1770
   1771NBDExport *nbd_export_find(const char *name)
   1772{
   1773    NBDExport *exp;
   1774    QTAILQ_FOREACH(exp, &exports, next) {
   1775        if (strcmp(name, exp->name) == 0) {
   1776            return exp;
   1777        }
   1778    }
   1779
   1780    return NULL;
   1781}
   1782
   1783AioContext *
   1784nbd_export_aio_context(NBDExport *exp)
   1785{
   1786    return exp->common.ctx;
   1787}
   1788
   1789static void nbd_export_request_shutdown(BlockExport *blk_exp)
   1790{
   1791    NBDExport *exp = container_of(blk_exp, NBDExport, common);
   1792    NBDClient *client, *next;
   1793
   1794    blk_exp_ref(&exp->common);
   1795    /*
   1796     * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a
   1797     * close mode that stops advertising the export to new clients but
   1798     * still permits existing clients to run to completion? Because of
   1799     * that possibility, nbd_export_close() can be called more than
   1800     * once on an export.
   1801     */
   1802    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
   1803        client_close(client, true);
   1804    }
   1805    if (exp->name) {
   1806        g_free(exp->name);
   1807        exp->name = NULL;
   1808        QTAILQ_REMOVE(&exports, exp, next);
   1809    }
   1810    blk_exp_unref(&exp->common);
   1811}
   1812
   1813static void nbd_export_delete(BlockExport *blk_exp)
   1814{
   1815    size_t i;
   1816    NBDExport *exp = container_of(blk_exp, NBDExport, common);
   1817
   1818    assert(exp->name == NULL);
   1819    assert(QTAILQ_EMPTY(&exp->clients));
   1820
   1821    g_free(exp->description);
   1822    exp->description = NULL;
   1823
   1824    if (exp->common.blk) {
   1825        if (exp->eject_notifier_blk) {
   1826            notifier_remove(&exp->eject_notifier);
   1827            blk_unref(exp->eject_notifier_blk);
   1828        }
   1829        blk_remove_aio_context_notifier(exp->common.blk, blk_aio_attached,
   1830                                        blk_aio_detach, exp);
   1831        blk_set_disable_request_queuing(exp->common.blk, false);
   1832    }
   1833
   1834    for (i = 0; i < exp->nr_export_bitmaps; i++) {
   1835        bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], false);
   1836    }
   1837}
   1838
   1839const BlockExportDriver blk_exp_nbd = {
   1840    .type               = BLOCK_EXPORT_TYPE_NBD,
   1841    .instance_size      = sizeof(NBDExport),
   1842    .create             = nbd_export_create,
   1843    .delete             = nbd_export_delete,
   1844    .request_shutdown   = nbd_export_request_shutdown,
   1845};
   1846
   1847static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
   1848                                        unsigned niov, Error **errp)
   1849{
   1850    int ret;
   1851
   1852    g_assert(qemu_in_coroutine());
   1853    qemu_co_mutex_lock(&client->send_lock);
   1854    client->send_coroutine = qemu_coroutine_self();
   1855
   1856    ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ? -EIO : 0;
   1857
   1858    client->send_coroutine = NULL;
   1859    qemu_co_mutex_unlock(&client->send_lock);
   1860
   1861    return ret;
   1862}
   1863
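        /*
         * A simple reply is a fixed 16-byte header (32-bit
         * NBD_SIMPLE_REPLY_MAGIC, 32-bit error, 64-bit handle), optionally
         * followed by payload data for NBD_CMD_READ; nbd_co_send_simple_reply()
         * below emits header and payload as a single iovec array.
         */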
   1864static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
   1865                                       uint64_t handle)
   1866{
   1867    stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
   1868    stl_be_p(&reply->error, error);
   1869    stq_be_p(&reply->handle, handle);
   1870}
   1871
   1872static int nbd_co_send_simple_reply(NBDClient *client,
   1873                                    uint64_t handle,
   1874                                    uint32_t error,
   1875                                    void *data,
   1876                                    size_t len,
   1877                                    Error **errp)
   1878{
   1879    NBDSimpleReply reply;
   1880    int nbd_err = system_errno_to_nbd_errno(error);
   1881    struct iovec iov[] = {
   1882        {.iov_base = &reply, .iov_len = sizeof(reply)},
   1883        {.iov_base = data, .iov_len = len}
   1884    };
   1885
   1886    trace_nbd_co_send_simple_reply(handle, nbd_err, nbd_err_lookup(nbd_err),
   1887                                   len);
   1888    set_be_simple_reply(&reply, nbd_err, handle);
   1889
   1890    return nbd_co_send_iov(client, iov, len ? 2 : 1, errp);
   1891}
   1892
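        /*
         * Every structured reply chunk starts with the 20-byte header filled
         * in by set_be_chunk() below: 32-bit NBD_STRUCTURED_REPLY_MAGIC,
         * 16-bit flags (NBD_REPLY_FLAG_DONE marks the final chunk of a reply),
         * 16-bit chunk type, 64-bit handle, and a 32-bit length of the payload
         * that follows the header.
         */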
   1893static inline void set_be_chunk(NBDStructuredReplyChunk *chunk, uint16_t flags,
   1894                                uint16_t type, uint64_t handle, uint32_t length)
   1895{
   1896    stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
   1897    stw_be_p(&chunk->flags, flags);
   1898    stw_be_p(&chunk->type, type);
   1899    stq_be_p(&chunk->handle, handle);
   1900    stl_be_p(&chunk->length, length);
   1901}
   1902
   1903static int coroutine_fn nbd_co_send_structured_done(NBDClient *client,
   1904                                                    uint64_t handle,
   1905                                                    Error **errp)
   1906{
   1907    NBDStructuredReplyChunk chunk;
   1908    struct iovec iov[] = {
   1909        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
   1910    };
   1911
   1912    trace_nbd_co_send_structured_done(handle);
   1913    set_be_chunk(&chunk, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_NONE, handle, 0);
   1914
   1915    return nbd_co_send_iov(client, iov, 1, errp);
   1916}
   1917
   1918static int coroutine_fn nbd_co_send_structured_read(NBDClient *client,
   1919                                                    uint64_t handle,
   1920                                                    uint64_t offset,
   1921                                                    void *data,
   1922                                                    size_t size,
   1923                                                    bool final,
   1924                                                    Error **errp)
   1925{
   1926    NBDStructuredReadData chunk;
   1927    struct iovec iov[] = {
   1928        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
   1929        {.iov_base = data, .iov_len = size}
   1930    };
   1931
   1932    assert(size);
   1933    trace_nbd_co_send_structured_read(handle, offset, data, size);
   1934    set_be_chunk(&chunk.h, final ? NBD_REPLY_FLAG_DONE : 0,
   1935                 NBD_REPLY_TYPE_OFFSET_DATA, handle,
   1936                 sizeof(chunk) - sizeof(chunk.h) + size);
   1937    stq_be_p(&chunk.offset, offset);
   1938
   1939    return nbd_co_send_iov(client, iov, 2, errp);
   1940}
   1941
   1942static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
   1943                                                     uint64_t handle,
   1944                                                     uint32_t error,
   1945                                                     const char *msg,
   1946                                                     Error **errp)
   1947{
   1948    NBDStructuredError chunk;
   1949    int nbd_err = system_errno_to_nbd_errno(error);
   1950    struct iovec iov[] = {
   1951        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
   1952        {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0},
   1953    };
   1954
   1955    assert(nbd_err);
   1956    trace_nbd_co_send_structured_error(handle, nbd_err,
   1957                                       nbd_err_lookup(nbd_err), msg ? msg : "");
   1958    set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_ERROR, handle,
   1959                 sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
   1960    stl_be_p(&chunk.error, nbd_err);
   1961    stw_be_p(&chunk.message_length, iov[1].iov_len);
   1962
   1963    return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
   1964}
   1965
   1966/* Do a sparse read and send the structured reply to the client.
   1967 * Returns -errno if sending fails. bdrv_block_status_above() failure is
   1968 * reported to the client, at which point this function succeeds.
   1969 */
   1970static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
   1971                                                uint64_t handle,
   1972                                                uint64_t offset,
   1973                                                uint8_t *data,
   1974                                                size_t size,
   1975                                                Error **errp)
   1976{
   1977    int ret = 0;
   1978    NBDExport *exp = client->exp;
   1979    size_t progress = 0;
   1980
   1981    while (progress < size) {
   1982        int64_t pnum;
   1983        int status = bdrv_block_status_above(blk_bs(exp->common.blk), NULL,
   1984                                             offset + progress,
   1985                                             size - progress, &pnum, NULL,
   1986                                             NULL);
   1987        bool final;
   1988
   1989        if (status < 0) {
   1990            char *msg = g_strdup_printf("unable to check for holes: %s",
   1991                                        strerror(-status));
   1992
   1993            ret = nbd_co_send_structured_error(client, handle, -status, msg,
   1994                                               errp);
   1995            g_free(msg);
   1996            return ret;
   1997        }
   1998        assert(pnum && pnum <= size - progress);
   1999        final = progress + pnum == size;
   2000        if (status & BDRV_BLOCK_ZERO) {
   2001            NBDStructuredReadHole chunk;
   2002            struct iovec iov[] = {
   2003                {.iov_base = &chunk, .iov_len = sizeof(chunk)},
   2004            };
   2005
   2006            trace_nbd_co_send_structured_read_hole(handle, offset + progress,
   2007                                                   pnum);
   2008            set_be_chunk(&chunk.h, final ? NBD_REPLY_FLAG_DONE : 0,
   2009                         NBD_REPLY_TYPE_OFFSET_HOLE,
   2010                         handle, sizeof(chunk) - sizeof(chunk.h));
   2011            stq_be_p(&chunk.offset, offset + progress);
   2012            stl_be_p(&chunk.length, pnum);
   2013            ret = nbd_co_send_iov(client, iov, 1, errp);
   2014        } else {
   2015            ret = blk_pread(exp->common.blk, offset + progress,
   2016                            data + progress, pnum);
   2017            if (ret < 0) {
   2018                error_setg_errno(errp, -ret, "reading from file failed");
   2019                break;
   2020            }
   2021            ret = nbd_co_send_structured_read(client, handle, offset + progress,
   2022                                              data + progress, pnum, final,
   2023                                              errp);
   2024        }
   2025
   2026        if (ret < 0) {
   2027            break;
   2028        }
   2029        progress += pnum;
   2030    }
   2031    return ret;
   2032}
   2033
   2034typedef struct NBDExtentArray {
   2035    NBDExtent *extents;
   2036    unsigned int nb_alloc;
   2037    unsigned int count;
   2038    uint64_t total_length;
   2039    bool can_add;
   2040    bool converted_to_be;
   2041} NBDExtentArray;
   2042
   2043static NBDExtentArray *nbd_extent_array_new(unsigned int nb_alloc)
   2044{
   2045    NBDExtentArray *ea = g_new0(NBDExtentArray, 1);
   2046
   2047    ea->nb_alloc = nb_alloc;
   2048    ea->extents = g_new(NBDExtent, nb_alloc);
   2049    ea->can_add = true;
   2050
   2051    return ea;
   2052}
   2053
   2054static void nbd_extent_array_free(NBDExtentArray *ea)
   2055{
   2056    g_free(ea->extents);
   2057    g_free(ea);
   2058}
   2059G_DEFINE_AUTOPTR_CLEANUP_FUNC(NBDExtentArray, nbd_extent_array_free);
   2060
    2061/* Further modifications of the array after conversion are not allowed */
   2062static void nbd_extent_array_convert_to_be(NBDExtentArray *ea)
   2063{
   2064    int i;
   2065
   2066    assert(!ea->converted_to_be);
   2067    ea->can_add = false;
   2068    ea->converted_to_be = true;
   2069
   2070    for (i = 0; i < ea->count; i++) {
   2071        ea->extents[i].flags = cpu_to_be32(ea->extents[i].flags);
   2072        ea->extents[i].length = cpu_to_be32(ea->extents[i].length);
   2073    }
   2074}
   2075
   2076/*
    2077 * Add an extent to the NBDExtentArray. If the extent can't be added (no space
    2078 * is available), return -1.
    2079 * For safety, the first time -1 is returned, .can_add is set to false, so any
    2080 * further call to nbd_extent_array_add() will assert.
    2081 * (This avoids the situation where a caller misses the -1, then adds another
    2082 * extent that appears to succeed because it is merged into the last one even
    2083 * though the array is full; the result would be an invalid array with a
    2084 * skipped extent.)
   2085 */
   2086static int nbd_extent_array_add(NBDExtentArray *ea,
   2087                                uint32_t length, uint32_t flags)
   2088{
   2089    assert(ea->can_add);
   2090
   2091    if (!length) {
   2092        return 0;
   2093    }
   2094
   2095    /* Extend previous extent if flags are the same */
   2096    if (ea->count > 0 && flags == ea->extents[ea->count - 1].flags) {
   2097        uint64_t sum = (uint64_t)length + ea->extents[ea->count - 1].length;
   2098
   2099        if (sum <= UINT32_MAX) {
   2100            ea->extents[ea->count - 1].length = sum;
   2101            ea->total_length += length;
   2102            return 0;
   2103        }
   2104    }
   2105
   2106    if (ea->count >= ea->nb_alloc) {
   2107        ea->can_add = false;
   2108        return -1;
   2109    }
   2110
   2111    ea->total_length += length;
   2112    ea->extents[ea->count] = (NBDExtent) {.length = length, .flags = flags};
   2113    ea->count++;
   2114
   2115    return 0;
   2116}
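        /*
         * Illustrative example: adding {length = 4096, flags = 0} and then
         * {length = 8192, flags = 0} leaves a single extent
         * {length = 12288, flags = 0} in the array, because consecutive
         * extents with equal flags are merged above.
         */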
   2117
   2118static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset,
   2119                                  uint64_t bytes, NBDExtentArray *ea)
   2120{
   2121    while (bytes) {
   2122        uint32_t flags;
   2123        int64_t num;
   2124        int ret = bdrv_block_status_above(bs, NULL, offset, bytes, &num,
   2125                                          NULL, NULL);
   2126
   2127        if (ret < 0) {
   2128            return ret;
   2129        }
   2130
   2131        flags = (ret & BDRV_BLOCK_DATA ? 0 : NBD_STATE_HOLE) |
   2132                (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);
   2133
   2134        if (nbd_extent_array_add(ea, num, flags) < 0) {
   2135            return 0;
   2136        }
   2137
   2138        offset += num;
   2139        bytes -= num;
   2140    }
   2141
   2142    return 0;
   2143}
   2144
   2145static int blockalloc_to_extents(BlockDriverState *bs, uint64_t offset,
   2146                                 uint64_t bytes, NBDExtentArray *ea)
   2147{
   2148    while (bytes) {
   2149        int64_t num;
   2150        int ret = bdrv_is_allocated_above(bs, NULL, false, offset, bytes,
   2151                                          &num);
   2152
   2153        if (ret < 0) {
   2154            return ret;
   2155        }
   2156
   2157        if (nbd_extent_array_add(ea, num, ret) < 0) {
   2158            return 0;
   2159        }
   2160
   2161        offset += num;
   2162        bytes -= num;
   2163    }
   2164
   2165    return 0;
   2166}
   2167
   2168/*
   2169 * nbd_co_send_extents
   2170 *
   2171 * @ea is converted to BE by the function
   2172 * @last controls whether NBD_REPLY_FLAG_DONE is sent.
   2173 */
   2174static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
   2175                               NBDExtentArray *ea,
   2176                               bool last, uint32_t context_id, Error **errp)
   2177{
   2178    NBDStructuredMeta chunk;
   2179    struct iovec iov[] = {
   2180        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
   2181        {.iov_base = ea->extents, .iov_len = ea->count * sizeof(ea->extents[0])}
   2182    };
   2183
   2184    nbd_extent_array_convert_to_be(ea);
   2185
   2186    trace_nbd_co_send_extents(handle, ea->count, context_id, ea->total_length,
   2187                              last);
   2188    set_be_chunk(&chunk.h, last ? NBD_REPLY_FLAG_DONE : 0,
   2189                 NBD_REPLY_TYPE_BLOCK_STATUS,
   2190                 handle, sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
   2191    stl_be_p(&chunk.context_id, context_id);
   2192
   2193    return nbd_co_send_iov(client, iov, 2, errp);
   2194}
   2195
   2196/* Get block status from the exported device and send it to the client */
   2197static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
   2198                                    BlockDriverState *bs, uint64_t offset,
   2199                                    uint32_t length, bool dont_fragment,
   2200                                    bool last, uint32_t context_id,
   2201                                    Error **errp)
   2202{
   2203    int ret;
   2204    unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
   2205    g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(nb_extents);
   2206
   2207    if (context_id == NBD_META_ID_BASE_ALLOCATION) {
   2208        ret = blockstatus_to_extents(bs, offset, length, ea);
   2209    } else {
   2210        ret = blockalloc_to_extents(bs, offset, length, ea);
   2211    }
   2212    if (ret < 0) {
   2213        return nbd_co_send_structured_error(
   2214                client, handle, -ret, "can't get block status", errp);
   2215    }
   2216
   2217    return nbd_co_send_extents(client, handle, ea, last, context_id, errp);
   2218}
   2219
    2220/* Populate @es from a dirty bitmap. */
   2221static void bitmap_to_extents(BdrvDirtyBitmap *bitmap,
   2222                              uint64_t offset, uint64_t length,
   2223                              NBDExtentArray *es)
   2224{
   2225    int64_t start, dirty_start, dirty_count;
   2226    int64_t end = offset + length;
   2227    bool full = false;
   2228
   2229    bdrv_dirty_bitmap_lock(bitmap);
   2230
   2231    for (start = offset;
   2232         bdrv_dirty_bitmap_next_dirty_area(bitmap, start, end, INT32_MAX,
   2233                                           &dirty_start, &dirty_count);
   2234         start = dirty_start + dirty_count)
   2235    {
   2236        if ((nbd_extent_array_add(es, dirty_start - start, 0) < 0) ||
   2237            (nbd_extent_array_add(es, dirty_count, NBD_STATE_DIRTY) < 0))
   2238        {
   2239            full = true;
   2240            break;
   2241        }
   2242    }
   2243
   2244    if (!full) {
    2245        /* last non-dirty extent; nothing to do if the array is now full */
   2246        (void) nbd_extent_array_add(es, end - start, 0);
   2247    }
   2248
   2249    bdrv_dirty_bitmap_unlock(bitmap);
   2250}
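        /*
         * Illustrative example: for offset = 0, length = 1 MiB and a single
         * dirty area covering [64 KiB, 128 KiB), the loop above produces the
         * extents {64 KiB, 0}, {64 KiB, NBD_STATE_DIRTY} and the trailing
         * {896 KiB, 0}, so clean and dirty ranges alternate and cover the
         * whole requested range.
         */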
   2251
   2252static int nbd_co_send_bitmap(NBDClient *client, uint64_t handle,
   2253                              BdrvDirtyBitmap *bitmap, uint64_t offset,
   2254                              uint32_t length, bool dont_fragment, bool last,
   2255                              uint32_t context_id, Error **errp)
   2256{
   2257    unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
   2258    g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(nb_extents);
   2259
   2260    bitmap_to_extents(bitmap, offset, length, ea);
   2261
   2262    return nbd_co_send_extents(client, handle, ea, last, context_id, errp);
   2263}
   2264
   2265/* nbd_co_receive_request
   2266 * Collect a client request. Return 0 if request looks valid, -EIO to drop
   2267 * connection right away, -EAGAIN to indicate we were interrupted and the
   2268 * channel should be quiesced, and any other negative value to report an error
   2269 * to the client (although the caller may still need to disconnect after
   2270 * reporting the error).
   2271 */
   2272static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
   2273                                  Error **errp)
   2274{
   2275    NBDClient *client = req->client;
   2276    int valid_flags;
   2277    int ret;
   2278
   2279    g_assert(qemu_in_coroutine());
   2280    assert(client->recv_coroutine == qemu_coroutine_self());
   2281    ret = nbd_receive_request(client, request, errp);
   2282    if (ret < 0) {
   2283        return  ret;
   2284    }
   2285
   2286    trace_nbd_co_receive_request_decode_type(request->handle, request->type,
   2287                                             nbd_cmd_lookup(request->type));
   2288
   2289    if (request->type != NBD_CMD_WRITE) {
   2290        /* No payload, we are ready to read the next request.  */
   2291        req->complete = true;
   2292    }
   2293
   2294    if (request->type == NBD_CMD_DISC) {
   2295        /* Special case: we're going to disconnect without a reply,
   2296         * whether or not flags, from, or len are bogus */
   2297        return -EIO;
   2298    }
   2299
   2300    if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE ||
   2301        request->type == NBD_CMD_CACHE)
   2302    {
   2303        if (request->len > NBD_MAX_BUFFER_SIZE) {
    2304            error_setg(errp, "len (%" PRIu32 ") is larger than max len (%u)",
   2305                       request->len, NBD_MAX_BUFFER_SIZE);
   2306            return -EINVAL;
   2307        }
   2308
   2309        if (request->type != NBD_CMD_CACHE) {
   2310            req->data = blk_try_blockalign(client->exp->common.blk,
   2311                                           request->len);
   2312            if (req->data == NULL) {
   2313                error_setg(errp, "No memory");
   2314                return -ENOMEM;
   2315            }
   2316        }
   2317    }
   2318
   2319    if (request->type == NBD_CMD_WRITE) {
   2320        if (nbd_read(client->ioc, req->data, request->len, "CMD_WRITE data",
   2321                     errp) < 0)
   2322        {
   2323            return -EIO;
   2324        }
   2325        req->complete = true;
   2326
   2327        trace_nbd_co_receive_request_payload_received(request->handle,
   2328                                                      request->len);
   2329    }
   2330
   2331    /* Sanity checks. */
   2332    if (client->exp->nbdflags & NBD_FLAG_READ_ONLY &&
   2333        (request->type == NBD_CMD_WRITE ||
   2334         request->type == NBD_CMD_WRITE_ZEROES ||
   2335         request->type == NBD_CMD_TRIM)) {
   2336        error_setg(errp, "Export is read-only");
   2337        return -EROFS;
   2338    }
   2339    if (request->from > client->exp->size ||
   2340        request->len > client->exp->size - request->from) {
   2341        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu32
   2342                   ", Size: %" PRIu64, request->from, request->len,
   2343                   client->exp->size);
   2344        return (request->type == NBD_CMD_WRITE ||
   2345                request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
   2346    }
   2347    if (client->check_align && !QEMU_IS_ALIGNED(request->from | request->len,
   2348                                                client->check_align)) {
   2349        /*
   2350         * The block layer gracefully handles unaligned requests, but
   2351         * it's still worth tracing client non-compliance
   2352         */
   2353        trace_nbd_co_receive_align_compliance(nbd_cmd_lookup(request->type),
   2354                                              request->from,
   2355                                              request->len,
   2356                                              client->check_align);
   2357    }
   2358    valid_flags = NBD_CMD_FLAG_FUA;
   2359    if (request->type == NBD_CMD_READ && client->structured_reply) {
   2360        valid_flags |= NBD_CMD_FLAG_DF;
   2361    } else if (request->type == NBD_CMD_WRITE_ZEROES) {
   2362        valid_flags |= NBD_CMD_FLAG_NO_HOLE | NBD_CMD_FLAG_FAST_ZERO;
   2363    } else if (request->type == NBD_CMD_BLOCK_STATUS) {
   2364        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
   2365    }
   2366    if (request->flags & ~valid_flags) {
   2367        error_setg(errp, "unsupported flags for command %s (got 0x%x)",
   2368                   nbd_cmd_lookup(request->type), request->flags);
   2369        return -EINVAL;
   2370    }
   2371
   2372    return 0;
   2373}
   2374
    2375/* Send a simple reply without a payload, or a structured error
   2376 * @error_msg is ignored if @ret >= 0
   2377 * Returns 0 if connection is still live, -errno on failure to talk to client
   2378 */
   2379static coroutine_fn int nbd_send_generic_reply(NBDClient *client,
   2380                                               uint64_t handle,
   2381                                               int ret,
   2382                                               const char *error_msg,
   2383                                               Error **errp)
   2384{
   2385    if (client->structured_reply && ret < 0) {
   2386        return nbd_co_send_structured_error(client, handle, -ret, error_msg,
   2387                                            errp);
   2388    } else {
   2389        return nbd_co_send_simple_reply(client, handle, ret < 0 ? -ret : 0,
   2390                                        NULL, 0, errp);
   2391    }
   2392}
   2393
   2394/* Handle NBD_CMD_READ request.
   2395 * Return -errno if sending fails. Other errors are reported directly to the
   2396 * client as an error reply. */
   2397static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request,
   2398                                        uint8_t *data, Error **errp)
   2399{
   2400    int ret;
   2401    NBDExport *exp = client->exp;
   2402
   2403    assert(request->type == NBD_CMD_READ);
   2404
   2405    /* XXX: NBD Protocol only documents use of FUA with WRITE */
   2406    if (request->flags & NBD_CMD_FLAG_FUA) {
   2407        ret = blk_co_flush(exp->common.blk);
   2408        if (ret < 0) {
   2409            return nbd_send_generic_reply(client, request->handle, ret,
   2410                                          "flush failed", errp);
   2411        }
   2412    }
   2413
   2414    if (client->structured_reply && !(request->flags & NBD_CMD_FLAG_DF) &&
   2415        request->len)
   2416    {
   2417        return nbd_co_send_sparse_read(client, request->handle, request->from,
   2418                                       data, request->len, errp);
   2419    }
   2420
   2421    ret = blk_pread(exp->common.blk, request->from, data, request->len);
   2422    if (ret < 0) {
   2423        return nbd_send_generic_reply(client, request->handle, ret,
   2424                                      "reading from file failed", errp);
   2425    }
   2426
   2427    if (client->structured_reply) {
   2428        if (request->len) {
   2429            return nbd_co_send_structured_read(client, request->handle,
   2430                                               request->from, data,
   2431                                               request->len, true, errp);
   2432        } else {
   2433            return nbd_co_send_structured_done(client, request->handle, errp);
   2434        }
   2435    } else {
   2436        return nbd_co_send_simple_reply(client, request->handle, 0,
   2437                                        data, request->len, errp);
   2438    }
   2439}
   2440
   2441/*
   2442 * nbd_do_cmd_cache
   2443 *
   2444 * Handle NBD_CMD_CACHE request.
   2445 * Return -errno if sending fails. Other errors are reported directly to the
   2446 * client as an error reply.
   2447 */
   2448static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request,
   2449                                         Error **errp)
   2450{
   2451    int ret;
   2452    NBDExport *exp = client->exp;
   2453
   2454    assert(request->type == NBD_CMD_CACHE);
   2455
   2456    ret = blk_co_preadv(exp->common.blk, request->from, request->len,
   2457                        NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH);
   2458
   2459    return nbd_send_generic_reply(client, request->handle, ret,
   2460                                  "caching data failed", errp);
   2461}
   2462
   2463/* Handle NBD request.
   2464 * Return -errno if sending fails. Other errors are reported directly to the
   2465 * client as an error reply. */
   2466static coroutine_fn int nbd_handle_request(NBDClient *client,
   2467                                           NBDRequest *request,
   2468                                           uint8_t *data, Error **errp)
   2469{
   2470    int ret;
   2471    int flags;
   2472    NBDExport *exp = client->exp;
   2473    char *msg;
   2474    size_t i;
   2475
   2476    switch (request->type) {
   2477    case NBD_CMD_CACHE:
   2478        return nbd_do_cmd_cache(client, request, errp);
   2479
   2480    case NBD_CMD_READ:
   2481        return nbd_do_cmd_read(client, request, data, errp);
   2482
   2483    case NBD_CMD_WRITE:
   2484        flags = 0;
   2485        if (request->flags & NBD_CMD_FLAG_FUA) {
   2486            flags |= BDRV_REQ_FUA;
   2487        }
   2488        ret = blk_pwrite(exp->common.blk, request->from, data, request->len,
   2489                         flags);
   2490        return nbd_send_generic_reply(client, request->handle, ret,
   2491                                      "writing to file failed", errp);
   2492
   2493    case NBD_CMD_WRITE_ZEROES:
   2494        flags = 0;
   2495        if (request->flags & NBD_CMD_FLAG_FUA) {
   2496            flags |= BDRV_REQ_FUA;
   2497        }
   2498        if (!(request->flags & NBD_CMD_FLAG_NO_HOLE)) {
   2499            flags |= BDRV_REQ_MAY_UNMAP;
   2500        }
   2501        if (request->flags & NBD_CMD_FLAG_FAST_ZERO) {
   2502            flags |= BDRV_REQ_NO_FALLBACK;
   2503        }
   2504        ret = 0;
   2505        /* FIXME simplify this when blk_pwrite_zeroes switches to 64-bit */
   2506        while (ret >= 0 && request->len) {
   2507            int align = client->check_align ?: 1;
   2508            int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES,
   2509                                                        align));
   2510            ret = blk_pwrite_zeroes(exp->common.blk, request->from, len, flags);
   2511            request->len -= len;
   2512            request->from += len;
   2513        }
   2514        return nbd_send_generic_reply(client, request->handle, ret,
   2515                                      "writing to file failed", errp);
   2516
   2517    case NBD_CMD_DISC:
   2518        /* unreachable, thanks to special case in nbd_co_receive_request() */
   2519        abort();
   2520
   2521    case NBD_CMD_FLUSH:
   2522        ret = blk_co_flush(exp->common.blk);
   2523        return nbd_send_generic_reply(client, request->handle, ret,
   2524                                      "flush failed", errp);
   2525
   2526    case NBD_CMD_TRIM:
   2527        ret = 0;
   2528        /* FIXME simplify this when blk_co_pdiscard switches to 64-bit */
   2529        while (ret >= 0 && request->len) {
   2530            int align = client->check_align ?: 1;
   2531            int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES,
   2532                                                        align));
   2533            ret = blk_co_pdiscard(exp->common.blk, request->from, len);
   2534            request->len -= len;
   2535            request->from += len;
   2536        }
   2537        if (ret >= 0 && request->flags & NBD_CMD_FLAG_FUA) {
   2538            ret = blk_co_flush(exp->common.blk);
   2539        }
   2540        return nbd_send_generic_reply(client, request->handle, ret,
   2541                                      "discard failed", errp);
   2542
   2543    case NBD_CMD_BLOCK_STATUS:
   2544        if (!request->len) {
   2545            return nbd_send_generic_reply(client, request->handle, -EINVAL,
   2546                                          "need non-zero length", errp);
   2547        }
   2548        if (client->export_meta.count) {
   2549            bool dont_fragment = request->flags & NBD_CMD_FLAG_REQ_ONE;
   2550            int contexts_remaining = client->export_meta.count;
   2551
   2552            if (client->export_meta.base_allocation) {
   2553                ret = nbd_co_send_block_status(client, request->handle,
   2554                                               blk_bs(exp->common.blk),
   2555                                               request->from,
   2556                                               request->len, dont_fragment,
   2557                                               !--contexts_remaining,
   2558                                               NBD_META_ID_BASE_ALLOCATION,
   2559                                               errp);
   2560                if (ret < 0) {
   2561                    return ret;
   2562                }
   2563            }
   2564
   2565            if (client->export_meta.allocation_depth) {
   2566                ret = nbd_co_send_block_status(client, request->handle,
   2567                                               blk_bs(exp->common.blk),
   2568                                               request->from, request->len,
   2569                                               dont_fragment,
   2570                                               !--contexts_remaining,
   2571                                               NBD_META_ID_ALLOCATION_DEPTH,
   2572                                               errp);
   2573                if (ret < 0) {
   2574                    return ret;
   2575                }
   2576            }
   2577
   2578            for (i = 0; i < client->exp->nr_export_bitmaps; i++) {
   2579                if (!client->export_meta.bitmaps[i]) {
   2580                    continue;
   2581                }
   2582                ret = nbd_co_send_bitmap(client, request->handle,
   2583                                         client->exp->export_bitmaps[i],
   2584                                         request->from, request->len,
   2585                                         dont_fragment, !--contexts_remaining,
   2586                                         NBD_META_ID_DIRTY_BITMAP + i, errp);
   2587                if (ret < 0) {
   2588                    return ret;
   2589                }
   2590            }
   2591
   2592            assert(!contexts_remaining);
   2593
   2594            return 0;
   2595        } else {
   2596            return nbd_send_generic_reply(client, request->handle, -EINVAL,
   2597                                          "CMD_BLOCK_STATUS not negotiated",
   2598                                          errp);
   2599        }
   2600
   2601    default:
   2602        msg = g_strdup_printf("invalid request type (%" PRIu32 ") received",
   2603                              request->type);
   2604        ret = nbd_send_generic_reply(client, request->handle, -EINVAL, msg,
   2605                                     errp);
   2606        g_free(msg);
   2607        return ret;
   2608    }
   2609}
   2610
   2611/* Owns a reference to the NBDClient passed as opaque.  */
   2612static coroutine_fn void nbd_trip(void *opaque)
   2613{
   2614    NBDClient *client = opaque;
   2615    NBDRequestData *req;
   2616    NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
   2617    int ret;
   2618    Error *local_err = NULL;
   2619
   2620    trace_nbd_trip();
   2621    if (client->closing) {
   2622        nbd_client_put(client);
   2623        return;
   2624    }
   2625
   2626    if (client->quiescing) {
   2627        /*
   2628         * We're switching between AIO contexts. Don't attempt to receive a new
    2629         * request; kick the main context, which may be waiting for us.
   2630         */
   2631        nbd_client_put(client);
   2632        client->recv_coroutine = NULL;
   2633        aio_wait_kick();
   2634        return;
   2635    }
   2636
   2637    req = nbd_request_get(client);
   2638    ret = nbd_co_receive_request(req, &request, &local_err);
   2639    client->recv_coroutine = NULL;
   2640
   2641    if (client->closing) {
   2642        /*
    2643         * The client may have been closed while we were blocked in
    2644         * nbd_co_receive_request().
   2645         */
   2646        goto done;
   2647    }
   2648
   2649    if (ret == -EAGAIN) {
   2650        assert(client->quiescing);
   2651        goto done;
   2652    }
   2653
   2654    nbd_client_receive_next_request(client);
   2655    if (ret == -EIO) {
   2656        goto disconnect;
   2657    }
   2658
   2659    if (ret < 0) {
    2660        /* It wasn't -EIO, so, according to nbd_co_receive_request()
   2661         * semantics, we should return the error to the client. */
   2662        Error *export_err = local_err;
   2663
   2664        local_err = NULL;
   2665        ret = nbd_send_generic_reply(client, request.handle, -EINVAL,
   2666                                     error_get_pretty(export_err), &local_err);
   2667        error_free(export_err);
   2668    } else {
   2669        ret = nbd_handle_request(client, &request, req->data, &local_err);
   2670    }
   2671    if (ret < 0) {
   2672        error_prepend(&local_err, "Failed to send reply: ");
   2673        goto disconnect;
   2674    }
   2675
   2676    /* We must disconnect after NBD_CMD_WRITE if we did not
   2677     * read the payload.
   2678     */
   2679    if (!req->complete) {
   2680        error_setg(&local_err, "Request handling failed in intermediate state");
   2681        goto disconnect;
   2682    }
   2683
   2684done:
   2685    nbd_request_put(req);
   2686    nbd_client_put(client);
   2687    return;
   2688
   2689disconnect:
   2690    if (local_err) {
    2691        error_reportf_err(local_err, "Disconnect client due to: ");
   2692    }
   2693    nbd_request_put(req);
   2694    client_close(client, true);
   2695    nbd_client_put(client);
   2696}
   2697
   2698static void nbd_client_receive_next_request(NBDClient *client)
   2699{
   2700    if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS &&
   2701        !client->quiescing) {
   2702        nbd_client_get(client);
   2703        client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
   2704        aio_co_schedule(client->exp->common.ctx, client->recv_coroutine);
   2705    }
   2706}
   2707
   2708static coroutine_fn void nbd_co_client_start(void *opaque)
   2709{
   2710    NBDClient *client = opaque;
   2711    Error *local_err = NULL;
   2712
   2713    qemu_co_mutex_init(&client->send_lock);
   2714
   2715    if (nbd_negotiate(client, &local_err)) {
   2716        if (local_err) {
   2717            error_report_err(local_err);
   2718        }
   2719        client_close(client, false);
   2720        return;
   2721    }
   2722
   2723    nbd_client_receive_next_request(client);
   2724}
   2725
   2726/*
   2727 * Create a new client listener using the given channel @sioc.
   2728 * Begin servicing it in a coroutine.  When the connection closes, call
   2729 * @close_fn with an indication of whether the client completed negotiation.
   2730 */
   2731void nbd_client_new(QIOChannelSocket *sioc,
   2732                    QCryptoTLSCreds *tlscreds,
   2733                    const char *tlsauthz,
   2734                    void (*close_fn)(NBDClient *, bool))
   2735{
   2736    NBDClient *client;
   2737    Coroutine *co;
   2738
   2739    client = g_new0(NBDClient, 1);
   2740    client->refcount = 1;
   2741    client->tlscreds = tlscreds;
   2742    if (tlscreds) {
   2743        object_ref(OBJECT(client->tlscreds));
   2744    }
   2745    client->tlsauthz = g_strdup(tlsauthz);
   2746    client->sioc = sioc;
   2747    object_ref(OBJECT(client->sioc));
   2748    client->ioc = QIO_CHANNEL(sioc);
   2749    object_ref(OBJECT(client->ioc));
   2750    client->close_fn = close_fn;
   2751
   2752    co = qemu_coroutine_create(nbd_co_client_start, client);
   2753    qemu_coroutine_enter(co);
   2754}
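        /*
         * Illustrative sketch (not part of the original file): how a caller
         * might hand a freshly accepted socket to nbd_client_new(). The
         * function names are hypothetical; in QEMU the real callers live in
         * blockdev-nbd.c and qemu-nbd.c. No TLS and no authorization are used
         * in this sketch.
         */
        #if 0
        /* Drop the initial reference created in nbd_client_new(). */
        static void example_client_closed(NBDClient *client, bool negotiated)
        {
            nbd_client_put(client);
        }

        static void example_serve(QIOChannelSocket *cioc)
        {
            nbd_client_new(cioc, NULL, NULL, example_client_closed);
        }
        #endif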