cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

ctrl.c (192260B)


      1/*
      2 * QEMU NVM Express Controller
      3 *
      4 * Copyright (c) 2012, Intel Corporation
      5 *
      6 * Written by Keith Busch <keith.busch@intel.com>
      7 *
      8 * This code is licensed under the GNU GPL v2 or later.
      9 */
     10
     11/**
     12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
     13 *
     14 *  https://nvmexpress.org/developers/nvme-specification/
     15 *
     16 *
     17 * Notes on coding style
     18 * ---------------------
     19 * While QEMU coding style prefers lowercase hexadecimals in constants, the
      20 * NVMe subsystem uses the format from the NVMe specifications in the comments
     21 * (i.e. 'h' suffix instead of '0x' prefix).
     22 *
     23 * Usage
     24 * -----
     25 * See docs/system/nvme.rst for extensive documentation.
     26 *
     27 * Add options:
     28 *      -drive file=<file>,if=none,id=<drive_id>
     29 *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
     30 *      -device nvme,serial=<serial>,id=<bus_name>, \
     31 *              cmb_size_mb=<cmb_size_mb[optional]>, \
     32 *              [pmrdev=<mem_backend_file_id>,] \
     33 *              max_ioqpairs=<N[optional]>, \
     34 *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
     35 *              mdts=<N[optional]>,vsl=<N[optional]>, \
     36 *              zoned.zasl=<N[optional]>, \
     37 *              zoned.auto_transition=<on|off[optional]>, \
     38 *              subsys=<subsys_id>
     39 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
     40 *              zoned=<true|false[optional]>, \
     41 *              subsys=<subsys_id>,detached=<true|false[optional]>
     42 *
      43 * Note that cmb_size_mb denotes the CMB size in MB. The CMB is assumed to be at
     44 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
     45 * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
     46 * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
     47 *
      48 * PMR emulation can be enabled by pointing `pmrdev` to a memory-backend-file.
     49 * For example:
     50 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
     51 *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
     52 *
     53 * The PMR will use BAR 4/5 exclusively.
     54 *
      55 * To attach controller(s) and namespace(s) to a subsystem, provide an
      56 * nvme-subsys device as above.
     57 *
     58 * nvme subsystem device parameters
     59 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     60 * - `nqn`
     61 *   This parameter provides the `<nqn_id>` part of the string
     62 *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
     63 *   of subsystem controllers. Note that `<nqn_id>` should be unique per
     64 *   subsystem, but this is not enforced by QEMU. If not specified, it will
     65 *   default to the value of the `id` parameter (`<subsys_id>`).
     66 *
     67 * nvme device parameters
     68 * ~~~~~~~~~~~~~~~~~~~~~~
     69 * - `subsys`
     70 *   Specifying this parameter attaches the controller to the subsystem and
     71 *   the SUBNQN field in the controller will report the NQN of the subsystem
      72 *   device. This also enables the multi-controller capability represented in
      73 *   the Identify Controller data structure's CMIC field (Controller Multi-path
      74 *   I/O and Namespace Sharing Capabilities).
     75 *
     76 * - `aerl`
     77 *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
      78 *   of concurrently outstanding Asynchronous Event Request commands supported
     79 *   by the controller. This is a 0's based value.
     80 *
     81 * - `aer_max_queued`
     82 *   This is the maximum number of events that the device will enqueue for
     83 *   completion when there are no outstanding AERs. When the maximum number of
      84 *   enqueued events is reached, subsequent events will be dropped.
     85 *
     86 * - `mdts`
     87 *   Indicates the maximum data transfer size for a command that transfers data
     88 *   between host-accessible memory and the controller. The value is specified
     89 *   as a power of two (2^n) and is in units of the minimum memory page size
     90 *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
     91 *
     92 * - `vsl`
     93 *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
     94 *   this value is specified as a power of two (2^n) and is in units of the
     95 *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
     96 *   KiB).
     97 *
     98 * - `zoned.zasl`
     99 *   Indicates the maximum data transfer size for the Zone Append command. Like
    100 *   `mdts`, the value is specified as a power of two (2^n) and is in units of
    101 *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
    102 *   defaulting to the value of `mdts`).
    103 *
    104 * - `zoned.auto_transition`
     105 *   Indicates whether zones in the Implicitly Opened state can be
     106 *   automatically transitioned to the Closed state for resource management
     107 *   purposes. Defaults to 'on'.
    108 *
    109 * nvme namespace device parameters
    110 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    111 * - `shared`
    112 *   When the parent nvme device (as defined explicitly by the 'bus' parameter
    113 *   or implicitly by the most recently defined NvmeBus) is linked to an
    114 *   nvme-subsys device, the namespace will be attached to all controllers in
    115 *   the subsystem. If set to 'off' (the default), the namespace will remain a
    116 *   private namespace and may only be attached to a single controller at a
    117 *   time.
    118 *
    119 * - `detached`
    120 *   This parameter is only valid together with the `subsys` parameter. If left
    121 *   at the default value (`false/off`), the namespace will be attached to all
    122 *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
    123 *   namespace will be available in the subsystem but not attached to any
    124 *   controllers.
    125 *
     126 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
    127 * In this case, the following namespace properties are available to configure
    128 * zoned operation:
    129 *     zoned.zone_size=<zone size in bytes, default: 128MiB>
    130 *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
    131 *
    132 *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
    133 *         The value 0 (default) forces zone capacity to be the same as zone
    134 *         size. The value of this property may not exceed zone size.
    135 *
    136 *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
    137 *         This value needs to be specified in 64B units. If it is zero,
    138 *         namespace(s) will not support zone descriptor extensions.
    139 *
    140 *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
    141 *         The default value means there is no limit to the number of
    142 *         concurrently active zones.
    143 *
    144 *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
    145 *         The default value means there is no limit to the number of
    146 *         concurrently open zones.
    147 *
    148 *     zoned.cross_read=<enable RAZB, default: false>
    149 *         Setting this property to true enables Read Across Zone Boundaries.
    150 */
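/*
 * Illustrative example (a sketch only; the image file name, IDs and serial
 * number below are made-up values plugged into the option syntax documented
 * above):
 *
 *     -drive file=nvm.img,if=none,id=nvm0
 *     -device nvme-subsys,id=subsys0,nqn=subsys0
 *     -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0
 *     -device nvme-ns,drive=nvm0,bus=nvme0,nsid=1
 */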
    151
    152#include "qemu/osdep.h"
    153#include "qemu/cutils.h"
    154#include "qemu/error-report.h"
    155#include "qemu/log.h"
    156#include "qemu/units.h"
    157#include "qapi/error.h"
    158#include "qapi/visitor.h"
    159#include "sysemu/sysemu.h"
    160#include "sysemu/block-backend.h"
    161#include "sysemu/hostmem.h"
    162#include "hw/pci/msix.h"
    163#include "migration/vmstate.h"
    164
    165#include "nvme.h"
    166#include "trace.h"
    167
    168#define NVME_MAX_IOQPAIRS 0xffff
    169#define NVME_DB_SIZE  4
    170#define NVME_SPEC_VER 0x00010400
    171#define NVME_CMB_BIR 2
    172#define NVME_PMR_BIR 4
    173#define NVME_TEMPERATURE 0x143
    174#define NVME_TEMPERATURE_WARNING 0x157
    175#define NVME_TEMPERATURE_CRITICAL 0x175
    176#define NVME_NUM_FW_SLOTS 1
    177#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
    178
    179#define NVME_GUEST_ERR(trace, fmt, ...) \
    180    do { \
    181        (trace_##trace)(__VA_ARGS__); \
    182        qemu_log_mask(LOG_GUEST_ERROR, #trace \
    183            " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    184    } while (0)
    185
    186static const bool nvme_feature_support[NVME_FID_MAX] = {
    187    [NVME_ARBITRATION]              = true,
    188    [NVME_POWER_MANAGEMENT]         = true,
    189    [NVME_TEMPERATURE_THRESHOLD]    = true,
    190    [NVME_ERROR_RECOVERY]           = true,
    191    [NVME_VOLATILE_WRITE_CACHE]     = true,
    192    [NVME_NUMBER_OF_QUEUES]         = true,
    193    [NVME_INTERRUPT_COALESCING]     = true,
    194    [NVME_INTERRUPT_VECTOR_CONF]    = true,
    195    [NVME_WRITE_ATOMICITY]          = true,
    196    [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
    197    [NVME_TIMESTAMP]                = true,
    198    [NVME_COMMAND_SET_PROFILE]      = true,
    199};
    200
    201static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    202    [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
    203    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    204    [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
    205    [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
    206    [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
    207    [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
    208    [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
    209};
    210
    211static const uint32_t nvme_cse_acs[256] = {
    212    [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
    213    [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
    214    [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
    215    [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
    216    [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
    217    [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
    218    [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
    219    [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    220    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    221    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
    222    [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    223    [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    224};
    225
    226static const uint32_t nvme_cse_iocs_none[256];
    227
    228static const uint32_t nvme_cse_iocs_nvm[256] = {
    229    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    230    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    231    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    232    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    233    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    234    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    235    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    236    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
    237};
    238
    239static const uint32_t nvme_cse_iocs_zoned[256] = {
    240    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    241    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    242    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    243    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    244    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    245    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    246    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    247    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
    248    [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    249    [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    250    [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
    251};
    252
    253static void nvme_process_sq(void *opaque);
    254
    255static uint16_t nvme_sqid(NvmeRequest *req)
    256{
    257    return le16_to_cpu(req->sq->sqid);
    258}
    259
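/*
 * Transition a zone to a new state: unlink it from the per-state list it is
 * currently on (if any), set the new state and, for states that are tracked
 * on a list, insert it on the corresponding list.
 */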
    260static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
    261                                   NvmeZoneState state)
    262{
    263    if (QTAILQ_IN_USE(zone, entry)) {
    264        switch (nvme_get_zone_state(zone)) {
    265        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    266            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
    267            break;
    268        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    269            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
    270            break;
    271        case NVME_ZONE_STATE_CLOSED:
    272            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
    273            break;
    274        case NVME_ZONE_STATE_FULL:
    275            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
    276        default:
    277            ;
    278        }
    279    }
    280
    281    nvme_set_zone_state(zone, state);
    282
    283    switch (state) {
    284    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    285        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
    286        break;
    287    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    288        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
    289        break;
    290    case NVME_ZONE_STATE_CLOSED:
    291        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
    292        break;
    293    case NVME_ZONE_STATE_FULL:
    294        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
    295    case NVME_ZONE_STATE_READ_ONLY:
    296        break;
    297    default:
    298        zone->d.za = 0;
    299    }
    300}
    301
    302/*
    303 * Check if we can open a zone without exceeding open/active limits.
    304 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
    305 */
    306static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
    307{
    308    if (ns->params.max_active_zones != 0 &&
    309        ns->nr_active_zones + act > ns->params.max_active_zones) {
    310        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
    311        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
    312    }
    313    if (ns->params.max_open_zones != 0 &&
    314        ns->nr_open_zones + opn > ns->params.max_open_zones) {
    315        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
    316        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
    317    }
    318
    319    return NVME_SUCCESS;
    320}
    321
    322static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
    323{
    324    hwaddr hi, lo;
    325
    326    if (!n->cmb.cmse) {
    327        return false;
    328    }
    329
    330    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    331    hi = lo + int128_get64(n->cmb.mem.size);
    332
    333    return addr >= lo && addr < hi;
    334}
    335
    336static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
    337{
    338    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    339    return &n->cmb.buf[addr - base];
    340}
    341
    342static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
    343{
    344    hwaddr hi;
    345
    346    if (!n->pmr.cmse) {
    347        return false;
    348    }
    349
    350    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
    351
    352    return addr >= n->pmr.cba && addr < hi;
    353}
    354
    355static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
    356{
    357    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
    358}
    359
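/*
 * Copy 'size' bytes at controller-visible address 'addr' into 'buf',
 * servicing the access from the CMB or PMR when the range falls entirely
 * within one of them, and falling back to PCI DMA otherwise.
 */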
    360static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
    361{
    362    hwaddr hi = addr + size - 1;
    363    if (hi < addr) {
    364        return 1;
    365    }
    366
    367    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
    368        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
    369        return 0;
    370    }
    371
    372    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
    373        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
    374        return 0;
    375    }
    376
    377    return pci_dma_read(&n->parent_obj, addr, buf, size);
    378}
    379
    380static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
    381{
    382    hwaddr hi = addr + size - 1;
    383    if (hi < addr) {
    384        return 1;
    385    }
    386
    387    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
    388        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
    389        return 0;
    390    }
    391
    392    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
    393        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
    394        return 0;
    395    }
    396
    397    return pci_dma_write(&n->parent_obj, addr, buf, size);
    398}
    399
    400static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
    401{
    402    return nsid &&
    403        (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
    404}
    405
    406static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
    407{
    408    return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
    409}
    410
    411static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
    412{
    413    return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
    414}
    415
    416static void nvme_inc_cq_tail(NvmeCQueue *cq)
    417{
    418    cq->tail++;
    419    if (cq->tail >= cq->size) {
    420        cq->tail = 0;
    421        cq->phase = !cq->phase;
    422    }
    423}
    424
    425static void nvme_inc_sq_head(NvmeSQueue *sq)
    426{
    427    sq->head = (sq->head + 1) % sq->size;
    428}
    429
    430static uint8_t nvme_cq_full(NvmeCQueue *cq)
    431{
    432    return (cq->tail + 1) % cq->size == cq->head;
    433}
    434
    435static uint8_t nvme_sq_empty(NvmeSQueue *sq)
    436{
    437    return sq->head == sq->tail;
    438}
    439
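/* Re-evaluate the pin-based interrupt line; a no-op when MSI-X is enabled. */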
    440static void nvme_irq_check(NvmeCtrl *n)
    441{
    442    uint32_t intms = ldl_le_p(&n->bar.intms);
    443
    444    if (msix_enabled(&(n->parent_obj))) {
    445        return;
    446    }
    447    if (~intms & n->irq_status) {
    448        pci_irq_assert(&n->parent_obj);
    449    } else {
    450        pci_irq_deassert(&n->parent_obj);
    451    }
    452}
    453
    454static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
    455{
    456    if (cq->irq_enabled) {
    457        if (msix_enabled(&(n->parent_obj))) {
    458            trace_pci_nvme_irq_msix(cq->vector);
    459            msix_notify(&(n->parent_obj), cq->vector);
    460        } else {
    461            trace_pci_nvme_irq_pin();
    462            assert(cq->vector < 32);
    463            n->irq_status |= 1 << cq->vector;
    464            nvme_irq_check(n);
    465        }
    466    } else {
    467        trace_pci_nvme_irq_masked();
    468    }
    469}
    470
    471static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
    472{
    473    if (cq->irq_enabled) {
    474        if (msix_enabled(&(n->parent_obj))) {
    475            return;
    476        } else {
    477            assert(cq->vector < 32);
    478            if (!n->cq_pending) {
    479                n->irq_status &= ~(1 << cq->vector);
    480            }
    481            nvme_irq_check(n);
    482        }
    483    }
    484}
    485
    486static void nvme_req_clear(NvmeRequest *req)
    487{
    488    req->ns = NULL;
    489    req->opaque = NULL;
    490    req->aiocb = NULL;
    491    memset(&req->cqe, 0x0, sizeof(req->cqe));
    492    req->status = NVME_SUCCESS;
    493}
    494
    495static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
    496{
    497    if (dma) {
    498        pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
    499        sg->flags = NVME_SG_DMA;
    500    } else {
    501        qemu_iovec_init(&sg->iov, 0);
    502    }
    503
    504    sg->flags |= NVME_SG_ALLOC;
    505}
    506
    507static inline void nvme_sg_unmap(NvmeSg *sg)
    508{
    509    if (!(sg->flags & NVME_SG_ALLOC)) {
    510        return;
    511    }
    512
    513    if (sg->flags & NVME_SG_DMA) {
    514        qemu_sglist_destroy(&sg->qsg);
    515    } else {
    516        qemu_iovec_destroy(&sg->iov);
    517    }
    518
    519    memset(sg, 0x0, sizeof(*sg));
    520}
    521
    522/*
     523 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
    524 * holds both data and metadata. This function splits the data and metadata
    525 * into two separate QSG/IOVs.
    526 */
    527static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
    528                          NvmeSg *mdata)
    529{
    530    NvmeSg *dst = data;
    531    uint32_t trans_len, count = ns->lbasz;
    532    uint64_t offset = 0;
    533    bool dma = sg->flags & NVME_SG_DMA;
    534    size_t sge_len;
    535    size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
    536    int sg_idx = 0;
    537
    538    assert(sg->flags & NVME_SG_ALLOC);
    539
    540    while (sg_len) {
    541        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
    542
    543        trans_len = MIN(sg_len, count);
    544        trans_len = MIN(trans_len, sge_len - offset);
    545
    546        if (dst) {
    547            if (dma) {
    548                qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
    549                                trans_len);
    550            } else {
    551                qemu_iovec_add(&dst->iov,
    552                               sg->iov.iov[sg_idx].iov_base + offset,
    553                               trans_len);
    554            }
    555        }
    556
    557        sg_len -= trans_len;
    558        count -= trans_len;
    559        offset += trans_len;
    560
    561        if (count == 0) {
    562            dst = (dst == data) ? mdata : data;
    563            count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
    564        }
    565
    566        if (sge_len == offset) {
    567            offset = 0;
    568            sg_idx++;
    569        }
    570    }
    571}
    572
    573static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
    574                                  size_t len)
    575{
    576    if (!len) {
    577        return NVME_SUCCESS;
    578    }
    579
    580    trace_pci_nvme_map_addr_cmb(addr, len);
    581
    582    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
    583        return NVME_DATA_TRAS_ERROR;
    584    }
    585
    586    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
    587
    588    return NVME_SUCCESS;
    589}
    590
    591static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
    592                                  size_t len)
    593{
    594    if (!len) {
    595        return NVME_SUCCESS;
    596    }
    597
    598    if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
    599        return NVME_DATA_TRAS_ERROR;
    600    }
    601
    602    qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
    603
    604    return NVME_SUCCESS;
    605}
    606
    607static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
    608{
    609    bool cmb = false, pmr = false;
    610
    611    if (!len) {
    612        return NVME_SUCCESS;
    613    }
    614
    615    trace_pci_nvme_map_addr(addr, len);
    616
    617    if (nvme_addr_is_cmb(n, addr)) {
    618        cmb = true;
    619    } else if (nvme_addr_is_pmr(n, addr)) {
    620        pmr = true;
    621    }
    622
    623    if (cmb || pmr) {
    624        if (sg->flags & NVME_SG_DMA) {
    625            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
    626        }
    627
    628        if (sg->iov.niov + 1 > IOV_MAX) {
    629            goto max_mappings_exceeded;
    630        }
    631
    632        if (cmb) {
    633            return nvme_map_addr_cmb(n, &sg->iov, addr, len);
    634        } else {
    635            return nvme_map_addr_pmr(n, &sg->iov, addr, len);
    636        }
    637    }
    638
    639    if (!(sg->flags & NVME_SG_DMA)) {
    640        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
    641    }
    642
    643    if (sg->qsg.nsg + 1 > IOV_MAX) {
    644        goto max_mappings_exceeded;
    645    }
    646
    647    qemu_sglist_add(&sg->qsg, addr, len);
    648
    649    return NVME_SUCCESS;
    650
    651max_mappings_exceeded:
    652    NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
    653                   "number of mappings exceed 1024");
    654    return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
    655}
    656
    657static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
    658{
    659    return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
    660}
    661
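/*
 * Map a PRP1/PRP2 pair into 'sg'. Transfers larger than two pages walk the
 * PRP list(s) pointed to by PRP2; transfers of up to two pages use PRP2 as a
 * second data pointer.
 */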
    662static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
    663                             uint64_t prp2, uint32_t len)
    664{
    665    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    666    trans_len = MIN(len, trans_len);
    667    int num_prps = (len >> n->page_bits) + 1;
    668    uint16_t status;
    669    int ret;
    670
    671    trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
    672
    673    nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
    674
    675    status = nvme_map_addr(n, sg, prp1, trans_len);
    676    if (status) {
    677        goto unmap;
    678    }
    679
    680    len -= trans_len;
    681    if (len) {
    682        if (len > n->page_size) {
    683            uint64_t prp_list[n->max_prp_ents];
    684            uint32_t nents, prp_trans;
    685            int i = 0;
    686
    687            /*
     688             * The first PRP list entry, pointed to by PRP2, may contain an offset.
     689             * Hence, we need to calculate the number of entries based on
    690             * that offset.
    691             */
    692            nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
    693            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
    694            ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
    695            if (ret) {
    696                trace_pci_nvme_err_addr_read(prp2);
    697                status = NVME_DATA_TRAS_ERROR;
    698                goto unmap;
    699            }
    700            while (len != 0) {
    701                uint64_t prp_ent = le64_to_cpu(prp_list[i]);
    702
    703                if (i == nents - 1 && len > n->page_size) {
    704                    if (unlikely(prp_ent & (n->page_size - 1))) {
    705                        trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
    706                        status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
    707                        goto unmap;
    708                    }
    709
    710                    i = 0;
    711                    nents = (len + n->page_size - 1) >> n->page_bits;
    712                    nents = MIN(nents, n->max_prp_ents);
    713                    prp_trans = nents * sizeof(uint64_t);
    714                    ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
    715                                         prp_trans);
    716                    if (ret) {
    717                        trace_pci_nvme_err_addr_read(prp_ent);
    718                        status = NVME_DATA_TRAS_ERROR;
    719                        goto unmap;
    720                    }
    721                    prp_ent = le64_to_cpu(prp_list[i]);
    722                }
    723
    724                if (unlikely(prp_ent & (n->page_size - 1))) {
    725                    trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
    726                    status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
    727                    goto unmap;
    728                }
    729
    730                trans_len = MIN(len, n->page_size);
    731                status = nvme_map_addr(n, sg, prp_ent, trans_len);
    732                if (status) {
    733                    goto unmap;
    734                }
    735
    736                len -= trans_len;
    737                i++;
    738            }
    739        } else {
    740            if (unlikely(prp2 & (n->page_size - 1))) {
    741                trace_pci_nvme_err_invalid_prp2_align(prp2);
    742                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
    743                goto unmap;
    744            }
    745            status = nvme_map_addr(n, sg, prp2, len);
    746            if (status) {
    747                goto unmap;
    748            }
    749        }
    750    }
    751
    752    return NVME_SUCCESS;
    753
    754unmap:
    755    nvme_sg_unmap(sg);
    756    return status;
    757}
    758
    759/*
    760 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
     761 * number of bytes mapped from *len.
    762 */
    763static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
    764                                  NvmeSglDescriptor *segment, uint64_t nsgld,
    765                                  size_t *len, NvmeCmd *cmd)
    766{
    767    dma_addr_t addr, trans_len;
    768    uint32_t dlen;
    769    uint16_t status;
    770
    771    for (int i = 0; i < nsgld; i++) {
    772        uint8_t type = NVME_SGL_TYPE(segment[i].type);
    773
    774        switch (type) {
    775        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
    776            if (cmd->opcode == NVME_CMD_WRITE) {
    777                continue;
    778            }
    779        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
    780            break;
    781        case NVME_SGL_DESCR_TYPE_SEGMENT:
    782        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
    783            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
    784        default:
    785            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
    786        }
    787
    788        dlen = le32_to_cpu(segment[i].len);
    789
    790        if (!dlen) {
    791            continue;
    792        }
    793
    794        if (*len == 0) {
    795            /*
    796             * All data has been mapped, but the SGL contains additional
    797             * segments and/or descriptors. The controller might accept
    798             * ignoring the rest of the SGL.
    799             */
    800            uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
    801            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
    802                break;
    803            }
    804
    805            trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
    806            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
    807        }
    808
    809        trans_len = MIN(*len, dlen);
    810
    811        if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
    812            goto next;
    813        }
    814
    815        addr = le64_to_cpu(segment[i].addr);
    816
    817        if (UINT64_MAX - addr < dlen) {
    818            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
    819        }
    820
    821        status = nvme_map_addr(n, sg, addr, trans_len);
    822        if (status) {
    823            return status;
    824        }
    825
    826next:
    827        *len -= trans_len;
    828    }
    829
    830    return NVME_SUCCESS;
    831}
    832
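/*
 * Map an SGL into 'sg': read chained Segment/Last Segment descriptors from
 * guest memory in chunks and map the Data Block descriptors they contain.
 */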
    833static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
    834                             size_t len, NvmeCmd *cmd)
    835{
    836    /*
    837     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
    838     * dynamically allocating a potentially huge SGL. The spec allows the SGL
    839     * to be larger (as in number of bytes required to describe the SGL
    840     * descriptors and segment chain) than the command transfer size, so it is
    841     * not bounded by MDTS.
    842     */
    843    const int SEG_CHUNK_SIZE = 256;
    844
    845    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
    846    uint64_t nsgld;
    847    uint32_t seg_len;
    848    uint16_t status;
    849    hwaddr addr;
    850    int ret;
    851
    852    sgld = &sgl;
    853    addr = le64_to_cpu(sgl.addr);
    854
    855    trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
    856
    857    nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
    858
    859    /*
    860     * If the entire transfer can be described with a single data block it can
    861     * be mapped directly.
    862     */
    863    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
    864        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
    865        if (status) {
    866            goto unmap;
    867        }
    868
    869        goto out;
    870    }
    871
    872    for (;;) {
    873        switch (NVME_SGL_TYPE(sgld->type)) {
    874        case NVME_SGL_DESCR_TYPE_SEGMENT:
    875        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
    876            break;
    877        default:
    878            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
    879        }
    880
    881        seg_len = le32_to_cpu(sgld->len);
    882
    883        /* check the length of the (Last) Segment descriptor */
    884        if ((!seg_len || seg_len & 0xf) &&
    885            (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
    886            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
    887        }
    888
    889        if (UINT64_MAX - addr < seg_len) {
    890            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
    891        }
    892
    893        nsgld = seg_len / sizeof(NvmeSglDescriptor);
    894
    895        while (nsgld > SEG_CHUNK_SIZE) {
    896            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
    897                trace_pci_nvme_err_addr_read(addr);
    898                status = NVME_DATA_TRAS_ERROR;
    899                goto unmap;
    900            }
    901
    902            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
    903                                       &len, cmd);
    904            if (status) {
    905                goto unmap;
    906            }
    907
    908            nsgld -= SEG_CHUNK_SIZE;
    909            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
    910        }
    911
    912        ret = nvme_addr_read(n, addr, segment, nsgld *
    913                             sizeof(NvmeSglDescriptor));
    914        if (ret) {
    915            trace_pci_nvme_err_addr_read(addr);
    916            status = NVME_DATA_TRAS_ERROR;
    917            goto unmap;
    918        }
    919
    920        last_sgld = &segment[nsgld - 1];
    921
    922        /*
    923         * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
    924         * then we are done.
    925         */
    926        switch (NVME_SGL_TYPE(last_sgld->type)) {
    927        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
    928        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
    929            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
    930            if (status) {
    931                goto unmap;
    932            }
    933
    934            goto out;
    935
    936        default:
    937            break;
    938        }
    939
    940        /*
    941         * If the last descriptor was not a Data Block or Bit Bucket, then the
    942         * current segment must not be a Last Segment.
    943         */
    944        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
    945            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
    946            goto unmap;
    947        }
    948
    949        sgld = last_sgld;
    950        addr = le64_to_cpu(sgld->addr);
    951
    952        /*
    953         * Do not map the last descriptor; it will be a Segment or Last Segment
    954         * descriptor and is handled by the next iteration.
    955         */
    956        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
    957        if (status) {
    958            goto unmap;
    959        }
    960    }
    961
    962out:
    963    /* if there is any residual left in len, the SGL was too short */
    964    if (len) {
    965        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
    966        goto unmap;
    967    }
    968
    969    return NVME_SUCCESS;
    970
    971unmap:
    972    nvme_sg_unmap(sg);
    973    return status;
    974}
    975
    976uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
    977                       NvmeCmd *cmd)
    978{
    979    uint64_t prp1, prp2;
    980
    981    switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
    982    case NVME_PSDT_PRP:
    983        prp1 = le64_to_cpu(cmd->dptr.prp1);
    984        prp2 = le64_to_cpu(cmd->dptr.prp2);
    985
    986        return nvme_map_prp(n, sg, prp1, prp2, len);
    987    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
    988    case NVME_PSDT_SGL_MPTR_SGL:
    989        return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
    990    default:
    991        return NVME_INVALID_FIELD;
    992    }
    993}
    994
    995static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
    996                              NvmeCmd *cmd)
    997{
    998    int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
    999    hwaddr mptr = le64_to_cpu(cmd->mptr);
   1000    uint16_t status;
   1001
   1002    if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
   1003        NvmeSglDescriptor sgl;
   1004
   1005        if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
   1006            return NVME_DATA_TRAS_ERROR;
   1007        }
   1008
   1009        status = nvme_map_sgl(n, sg, sgl, len, cmd);
   1010        if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
   1011            status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
   1012        }
   1013
   1014        return status;
   1015    }
   1016
   1017    nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
   1018    status = nvme_map_addr(n, sg, mptr, len);
   1019    if (status) {
   1020        nvme_sg_unmap(sg);
   1021    }
   1022
   1023    return status;
   1024}
   1025
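/*
 * Map the data for an I/O command covering 'nlb' logical blocks. For
 * extended-LBA namespaces the interleaved data+metadata mapping is split so
 * that req->sg covers only the data bytes (unless PRACT makes the controller
 * generate/strip the 8-byte protection information).
 */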
   1026static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
   1027{
   1028    NvmeNamespace *ns = req->ns;
   1029    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   1030    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
   1031    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
   1032    size_t len = nvme_l2b(ns, nlb);
   1033    uint16_t status;
   1034
   1035    if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
   1036        NvmeSg sg;
   1037
   1038        len += nvme_m2b(ns, nlb);
   1039
   1040        status = nvme_map_dptr(n, &sg, len, &req->cmd);
   1041        if (status) {
   1042            return status;
   1043        }
   1044
   1045        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
   1046        nvme_sg_split(&sg, ns, &req->sg, NULL);
   1047        nvme_sg_unmap(&sg);
   1048
   1049        return NVME_SUCCESS;
   1050    }
   1051
   1052    return nvme_map_dptr(n, &req->sg, len, &req->cmd);
   1053}
   1054
   1055static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
   1056{
   1057    NvmeNamespace *ns = req->ns;
   1058    size_t len = nvme_m2b(ns, nlb);
   1059    uint16_t status;
   1060
   1061    if (nvme_ns_ext(ns)) {
   1062        NvmeSg sg;
   1063
   1064        len += nvme_l2b(ns, nlb);
   1065
   1066        status = nvme_map_dptr(n, &sg, len, &req->cmd);
   1067        if (status) {
   1068            return status;
   1069        }
   1070
   1071        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
   1072        nvme_sg_split(&sg, ns, NULL, &req->sg);
   1073        nvme_sg_unmap(&sg);
   1074
   1075        return NVME_SUCCESS;
   1076    }
   1077
   1078    return nvme_map_mptr(n, &req->sg, len, &req->cmd);
   1079}
   1080
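/*
 * Copy to/from an interleaved (extended LBA) mapping: transfer 'bytes'-sized
 * chunks between 'ptr' and 'sg', skipping 'skip_bytes' in the mapping after
 * each chunk, starting 'offset' bytes into the mapping.
 */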
   1081static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
   1082                                    uint32_t len, uint32_t bytes,
   1083                                    int32_t skip_bytes, int64_t offset,
   1084                                    NvmeTxDirection dir)
   1085{
   1086    hwaddr addr;
   1087    uint32_t trans_len, count = bytes;
   1088    bool dma = sg->flags & NVME_SG_DMA;
   1089    int64_t sge_len;
   1090    int sg_idx = 0;
   1091    int ret;
   1092
   1093    assert(sg->flags & NVME_SG_ALLOC);
   1094
   1095    while (len) {
   1096        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
   1097
   1098        if (sge_len - offset < 0) {
   1099            offset -= sge_len;
   1100            sg_idx++;
   1101            continue;
   1102        }
   1103
   1104        if (sge_len == offset) {
   1105            offset = 0;
   1106            sg_idx++;
   1107            continue;
   1108        }
   1109
   1110        trans_len = MIN(len, count);
   1111        trans_len = MIN(trans_len, sge_len - offset);
   1112
   1113        if (dma) {
   1114            addr = sg->qsg.sg[sg_idx].base + offset;
   1115        } else {
   1116            addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
   1117        }
   1118
   1119        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
   1120            ret = nvme_addr_read(n, addr, ptr, trans_len);
   1121        } else {
   1122            ret = nvme_addr_write(n, addr, ptr, trans_len);
   1123        }
   1124
   1125        if (ret) {
   1126            return NVME_DATA_TRAS_ERROR;
   1127        }
   1128
   1129        ptr += trans_len;
   1130        len -= trans_len;
   1131        count -= trans_len;
   1132        offset += trans_len;
   1133
   1134        if (count == 0) {
   1135            count = bytes;
   1136            offset += skip_bytes;
   1137        }
   1138    }
   1139
   1140    return NVME_SUCCESS;
   1141}
   1142
   1143static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
   1144                        NvmeTxDirection dir)
   1145{
   1146    assert(sg->flags & NVME_SG_ALLOC);
   1147
   1148    if (sg->flags & NVME_SG_DMA) {
   1149        uint64_t residual;
   1150
   1151        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
   1152            residual = dma_buf_write(ptr, len, &sg->qsg);
   1153        } else {
   1154            residual = dma_buf_read(ptr, len, &sg->qsg);
   1155        }
   1156
   1157        if (unlikely(residual)) {
   1158            trace_pci_nvme_err_invalid_dma();
   1159            return NVME_INVALID_FIELD | NVME_DNR;
   1160        }
   1161    } else {
   1162        size_t bytes;
   1163
   1164        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
   1165            bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
   1166        } else {
   1167            bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
   1168        }
   1169
   1170        if (unlikely(bytes != len)) {
   1171            trace_pci_nvme_err_invalid_dma();
   1172            return NVME_INVALID_FIELD | NVME_DNR;
   1173        }
   1174    }
   1175
   1176    return NVME_SUCCESS;
   1177}
   1178
   1179static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
   1180                                NvmeRequest *req)
   1181{
   1182    uint16_t status;
   1183
   1184    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
   1185    if (status) {
   1186        return status;
   1187    }
   1188
   1189    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
   1190}
   1191
   1192static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
   1193                                NvmeRequest *req)
   1194{
   1195    uint16_t status;
   1196
   1197    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
   1198    if (status) {
   1199        return status;
   1200    }
   1201
   1202    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
   1203}
   1204
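/*
 * Copy 'len' bytes between 'ptr' and the mapped data pointer of the command,
 * honouring extended-LBA data/metadata interleaving when applicable.
 */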
   1205uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
   1206                          NvmeTxDirection dir, NvmeRequest *req)
   1207{
   1208    NvmeNamespace *ns = req->ns;
   1209    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   1210    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
   1211    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
   1212
   1213    if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
   1214        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
   1215                                   ns->lbaf.ms, 0, dir);
   1216    }
   1217
   1218    return nvme_tx(n, &req->sg, ptr, len, dir);
   1219}
   1220
   1221uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
   1222                           NvmeTxDirection dir, NvmeRequest *req)
   1223{
   1224    NvmeNamespace *ns = req->ns;
   1225    uint16_t status;
   1226
   1227    if (nvme_ns_ext(ns)) {
   1228        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
   1229                                   ns->lbasz, ns->lbasz, dir);
   1230    }
   1231
   1232    nvme_sg_unmap(&req->sg);
   1233
   1234    status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
   1235    if (status) {
   1236        return status;
   1237    }
   1238
   1239    return nvme_tx(n, &req->sg, ptr, len, dir);
   1240}
   1241
   1242static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
   1243                                 BlockCompletionFunc *cb, NvmeRequest *req)
   1244{
   1245    assert(req->sg.flags & NVME_SG_ALLOC);
   1246
   1247    if (req->sg.flags & NVME_SG_DMA) {
   1248        req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
   1249                                  cb, req);
   1250    } else {
   1251        req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
   1252    }
   1253}
   1254
   1255static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
   1256                                  BlockCompletionFunc *cb, NvmeRequest *req)
   1257{
   1258    assert(req->sg.flags & NVME_SG_ALLOC);
   1259
   1260    if (req->sg.flags & NVME_SG_DMA) {
   1261        req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
   1262                                   cb, req);
   1263    } else {
   1264        req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
   1265    }
   1266}
   1267
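/*
 * Post completions: write CQEs for finished requests into guest memory at the
 * current CQ tail, return the requests to their submission queues and raise
 * the completion interrupt when new entries were posted.
 */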
   1268static void nvme_post_cqes(void *opaque)
   1269{
   1270    NvmeCQueue *cq = opaque;
   1271    NvmeCtrl *n = cq->ctrl;
   1272    NvmeRequest *req, *next;
   1273    bool pending = cq->head != cq->tail;
   1274    int ret;
   1275
   1276    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
   1277        NvmeSQueue *sq;
   1278        hwaddr addr;
   1279
   1280        if (nvme_cq_full(cq)) {
   1281            break;
   1282        }
   1283
   1284        sq = req->sq;
   1285        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
   1286        req->cqe.sq_id = cpu_to_le16(sq->sqid);
   1287        req->cqe.sq_head = cpu_to_le16(sq->head);
   1288        addr = cq->dma_addr + cq->tail * n->cqe_size;
   1289        ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
   1290                            sizeof(req->cqe));
   1291        if (ret) {
   1292            trace_pci_nvme_err_addr_write(addr);
   1293            trace_pci_nvme_err_cfs();
   1294            stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
   1295            break;
   1296        }
   1297        QTAILQ_REMOVE(&cq->req_list, req, entry);
   1298        nvme_inc_cq_tail(cq);
   1299        nvme_sg_unmap(&req->sg);
   1300        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
   1301    }
   1302    if (cq->tail != cq->head) {
   1303        if (cq->irq_enabled && !pending) {
   1304            n->cq_pending++;
   1305        }
   1306
   1307        nvme_irq_assert(n, cq);
   1308    }
   1309}
   1310
   1311static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
   1312{
   1313    assert(cq->cqid == req->sq->cqid);
   1314    trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
   1315                                          le32_to_cpu(req->cqe.result),
   1316                                          le32_to_cpu(req->cqe.dw1),
   1317                                          req->status);
   1318
   1319    if (req->status) {
   1320        trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
   1321                                      req->status, req->cmd.opcode);
   1322    }
   1323
   1324    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
   1325    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
   1326    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
   1327}
   1328
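/*
 * Complete queued asynchronous events against outstanding AER commands,
 * skipping event types that are currently masked (a CQE was posted but the
 * host has not yet cleared the event).
 */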
   1329static void nvme_process_aers(void *opaque)
   1330{
   1331    NvmeCtrl *n = opaque;
   1332    NvmeAsyncEvent *event, *next;
   1333
   1334    trace_pci_nvme_process_aers(n->aer_queued);
   1335
   1336    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
   1337        NvmeRequest *req;
   1338        NvmeAerResult *result;
   1339
   1340        /* can't post cqe if there is nothing to complete */
   1341        if (!n->outstanding_aers) {
   1342            trace_pci_nvme_no_outstanding_aers();
   1343            break;
   1344        }
   1345
   1346        /* ignore if masked (cqe posted, but event not cleared) */
   1347        if (n->aer_mask & (1 << event->result.event_type)) {
   1348            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
   1349            continue;
   1350        }
   1351
   1352        QTAILQ_REMOVE(&n->aer_queue, event, entry);
   1353        n->aer_queued--;
   1354
   1355        n->aer_mask |= 1 << event->result.event_type;
   1356        n->outstanding_aers--;
   1357
   1358        req = n->aer_reqs[n->outstanding_aers];
   1359
   1360        result = (NvmeAerResult *) &req->cqe.result;
   1361        result->event_type = event->result.event_type;
   1362        result->event_info = event->result.event_info;
   1363        result->log_page = event->result.log_page;
   1364        g_free(event);
   1365
   1366        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
   1367                                    result->log_page);
   1368
   1369        nvme_enqueue_req_completion(&n->admin_cq, req);
   1370    }
   1371}
   1372
   1373static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
   1374                               uint8_t event_info, uint8_t log_page)
   1375{
   1376    NvmeAsyncEvent *event;
   1377
   1378    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
   1379
   1380    if (n->aer_queued == n->params.aer_max_queued) {
   1381        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
   1382        return;
   1383    }
   1384
   1385    event = g_new(NvmeAsyncEvent, 1);
   1386    event->result = (NvmeAerResult) {
   1387        .event_type = event_type,
   1388        .event_info = event_info,
   1389        .log_page   = log_page,
   1390    };
   1391
   1392    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
   1393    n->aer_queued++;
   1394
   1395    nvme_process_aers(n);
   1396}
   1397
   1398static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
   1399{
   1400    uint8_t aer_info;
   1401
    1402    /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
   1403    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
   1404        return;
   1405    }
   1406
   1407    switch (event) {
   1408    case NVME_SMART_SPARE:
   1409        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
   1410        break;
   1411    case NVME_SMART_TEMPERATURE:
   1412        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
   1413        break;
   1414    case NVME_SMART_RELIABILITY:
   1415    case NVME_SMART_MEDIA_READ_ONLY:
   1416    case NVME_SMART_FAILED_VOLATILE_MEDIA:
   1417    case NVME_SMART_PMR_UNRELIABLE:
   1418        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
   1419        break;
   1420    default:
   1421        return;
   1422    }
   1423
   1424    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
   1425}
   1426
   1427static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
   1428{
   1429    n->aer_mask &= ~(1 << event_type);
   1430    if (!QTAILQ_EMPTY(&n->aer_queue)) {
   1431        nvme_process_aers(n);
   1432    }
   1433}
   1434
   1435static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
   1436{
   1437    uint8_t mdts = n->params.mdts;
   1438
   1439    if (mdts && len > n->page_size << mdts) {
   1440        trace_pci_nvme_err_mdts(len);
   1441        return NVME_INVALID_FIELD | NVME_DNR;
   1442    }
   1443
   1444    return NVME_SUCCESS;
   1445}
   1446
   1447static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
   1448                                         uint32_t nlb)
   1449{
   1450    uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
   1451
   1452    if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
   1453        trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
   1454        return NVME_LBA_RANGE | NVME_DNR;
   1455    }
   1456
   1457    return NVME_SUCCESS;
   1458}
   1459
   1460static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
   1461                                 uint32_t nlb, int flags)
   1462{
   1463    BlockDriverState *bs = blk_bs(ns->blkconf.blk);
   1464
   1465    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
   1466    int64_t offset = nvme_l2b(ns, slba);
   1467    int ret;
   1468
   1469    /*
    1470     * `pnum` holds the number of bytes after offset that share the same
   1471     * allocation status as the byte at offset. If `pnum` is different from
   1472     * `bytes`, we should check the allocation status of the next range and
   1473     * continue this until all bytes have been checked.
   1474     */
   1475    do {
   1476        bytes -= pnum;
   1477
   1478        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
   1479        if (ret < 0) {
   1480            return ret;
   1481        }
   1482
   1483
   1484        trace_pci_nvme_block_status(offset, bytes, pnum, ret,
   1485                                    !!(ret & BDRV_BLOCK_ZERO));
   1486
   1487        if (!(ret & flags)) {
   1488            return 1;
   1489        }
   1490
   1491        offset += pnum;
   1492    } while (pnum != bytes);
   1493
   1494    return 0;
   1495}
   1496
   1497static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
   1498                                 uint32_t nlb)
   1499{
   1500    int ret;
   1501    Error *err = NULL;
   1502
   1503    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
   1504    if (ret) {
   1505        if (ret < 0) {
   1506            error_setg_errno(&err, -ret, "unable to get block status");
   1507            error_report_err(err);
   1508
   1509            return NVME_INTERNAL_DEV_ERROR;
   1510        }
   1511
   1512        return NVME_DULB;
   1513    }
   1514
   1515    return NVME_SUCCESS;
   1516}
   1517
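/*
 * Record the NVMe status for a failed AIO, mapping the opcode to an
 * appropriate status code (e.g. Unrecovered Read Error or Write Fault).
 */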
   1518static void nvme_aio_err(NvmeRequest *req, int ret)
   1519{
   1520    uint16_t status = NVME_SUCCESS;
   1521    Error *local_err = NULL;
   1522
   1523    switch (req->cmd.opcode) {
   1524    case NVME_CMD_READ:
   1525        status = NVME_UNRECOVERED_READ;
   1526        break;
   1527    case NVME_CMD_FLUSH:
   1528    case NVME_CMD_WRITE:
   1529    case NVME_CMD_WRITE_ZEROES:
   1530    case NVME_CMD_ZONE_APPEND:
   1531        status = NVME_WRITE_FAULT;
   1532        break;
   1533    default:
   1534        status = NVME_INTERNAL_DEV_ERROR;
   1535        break;
   1536    }
   1537
   1538    trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
   1539
   1540    error_setg_errno(&local_err, -ret, "aio failed");
   1541    error_report_err(local_err);
   1542
   1543    /*
   1544     * Set the command status code to the first encountered error but allow a
   1545     * subsequent Internal Device Error to trump it.
   1546     */
   1547    if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
   1548        return;
   1549    }
   1550
   1551    req->status = status;
   1552}
   1553
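        /*
         * Zoned namespace helpers: translate an LBA into the index of the
         * zone that contains it, using a shift when the zone size is a power
         * of two and a division otherwise.
         */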
   1554static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
   1555{
   1556    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
   1557                                    slba / ns->zone_size;
   1558}
   1559
   1560static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
   1561{
   1562    uint32_t zone_idx = nvme_zone_idx(ns, slba);
   1563
   1564    if (zone_idx >= ns->num_zones) {
   1565        return NULL;
   1566    }
   1567
   1568    return &ns->zone_array[zone_idx];
   1569}
   1570
   1571static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
   1572{
   1573    uint64_t zslba = zone->d.zslba;
   1574
   1575    switch (nvme_get_zone_state(zone)) {
   1576    case NVME_ZONE_STATE_EMPTY:
   1577    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
   1578    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
   1579    case NVME_ZONE_STATE_CLOSED:
   1580        return NVME_SUCCESS;
   1581    case NVME_ZONE_STATE_FULL:
   1582        trace_pci_nvme_err_zone_is_full(zslba);
   1583        return NVME_ZONE_FULL;
   1584    case NVME_ZONE_STATE_OFFLINE:
   1585        trace_pci_nvme_err_zone_is_offline(zslba);
   1586        return NVME_ZONE_OFFLINE;
   1587    case NVME_ZONE_STATE_READ_ONLY:
   1588        trace_pci_nvme_err_zone_is_read_only(zslba);
   1589        return NVME_ZONE_READ_ONLY;
   1590    default:
   1591        assert(false);
   1592    }
   1593
   1594    return NVME_INTERNAL_DEV_ERROR;
   1595}
   1596
   1597static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
   1598                                      uint64_t slba, uint32_t nlb)
   1599{
   1600    uint64_t zcap = nvme_zone_wr_boundary(zone);
   1601    uint16_t status;
   1602
   1603    status = nvme_check_zone_state_for_write(zone);
   1604    if (status) {
   1605        return status;
   1606    }
   1607
   1608    if (unlikely(slba != zone->w_ptr)) {
   1609        trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
   1610        return NVME_ZONE_INVALID_WRITE;
   1611    }
   1612
   1613    if (unlikely((slba + nlb) > zcap)) {
   1614        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
   1615        return NVME_ZONE_BOUNDARY_ERROR;
   1616    }
   1617
   1618    return NVME_SUCCESS;
   1619}
   1620
   1621static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
   1622{
   1623    switch (nvme_get_zone_state(zone)) {
   1624    case NVME_ZONE_STATE_EMPTY:
   1625    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
   1626    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
   1627    case NVME_ZONE_STATE_FULL:
   1628    case NVME_ZONE_STATE_CLOSED:
   1629    case NVME_ZONE_STATE_READ_ONLY:
   1630        return NVME_SUCCESS;
   1631    case NVME_ZONE_STATE_OFFLINE:
   1632        trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
   1633        return NVME_ZONE_OFFLINE;
   1634    default:
   1635        assert(false);
   1636    }
   1637
   1638    return NVME_INTERNAL_DEV_ERROR;
   1639}
   1640
   1641static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
   1642                                     uint32_t nlb)
   1643{
   1644    NvmeZone *zone;
   1645    uint64_t bndry, end;
   1646    uint16_t status;
   1647
   1648    zone = nvme_get_zone_by_slba(ns, slba);
   1649    assert(zone);
   1650
   1651    bndry = nvme_zone_rd_boundary(ns, zone);
   1652    end = slba + nlb;
   1653
   1654    status = nvme_check_zone_state_for_read(zone);
   1655    if (status) {
   1656        ;
   1657    } else if (unlikely(end > bndry)) {
   1658        if (!ns->params.cross_zone_read) {
   1659            status = NVME_ZONE_BOUNDARY_ERROR;
   1660        } else {
   1661            /*
   1662             * Read across zone boundary - check that all subsequent
   1663             * zones that are being read have an appropriate state.
   1664             */
   1665            do {
   1666                zone++;
   1667                status = nvme_check_zone_state_for_read(zone);
   1668                if (status) {
   1669                    break;
   1670                }
   1671            } while (end > nvme_zone_rd_boundary(ns, zone));
   1672        }
   1673    }
   1674
   1675    return status;
   1676}
   1677
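        /*
         * Zone Resource Management (zrm) transitions. The fallthroughs below
         * walk the zone state machine towards the target state while keeping
         * the per-namespace open and active zone counters balanced.
         */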
   1678static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
   1679{
   1680    switch (nvme_get_zone_state(zone)) {
   1681    case NVME_ZONE_STATE_FULL:
   1682        return NVME_SUCCESS;
   1683
   1684    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
   1685    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
   1686        nvme_aor_dec_open(ns);
   1687        /* fallthrough */
   1688    case NVME_ZONE_STATE_CLOSED:
   1689        nvme_aor_dec_active(ns);
   1690        /* fallthrough */
   1691    case NVME_ZONE_STATE_EMPTY:
   1692        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
   1693        return NVME_SUCCESS;
   1694
   1695    default:
   1696        return NVME_ZONE_INVAL_TRANSITION;
   1697    }
   1698}
   1699
   1700static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
   1701{
   1702    switch (nvme_get_zone_state(zone)) {
   1703    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
   1704    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
   1705        nvme_aor_dec_open(ns);
   1706        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
   1707        /* fall through */
   1708    case NVME_ZONE_STATE_CLOSED:
   1709        return NVME_SUCCESS;
   1710
   1711    default:
   1712        return NVME_ZONE_INVAL_TRANSITION;
   1713    }
   1714}
   1715
   1716static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
   1717{
   1718    switch (nvme_get_zone_state(zone)) {
   1719    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
   1720    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
   1721        nvme_aor_dec_open(ns);
   1722        /* fallthrough */
   1723    case NVME_ZONE_STATE_CLOSED:
   1724        nvme_aor_dec_active(ns);
   1725        /* fallthrough */
   1726    case NVME_ZONE_STATE_FULL:
   1727        zone->w_ptr = zone->d.zslba;
   1728        zone->d.wp = zone->w_ptr;
   1729        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
   1730        /* fallthrough */
   1731    case NVME_ZONE_STATE_EMPTY:
   1732        return NVME_SUCCESS;
   1733
   1734    default:
   1735        return NVME_ZONE_INVAL_TRANSITION;
   1736    }
   1737}
   1738
   1739static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
   1740{
   1741    NvmeZone *zone;
   1742
   1743    if (ns->params.max_open_zones &&
   1744        ns->nr_open_zones == ns->params.max_open_zones) {
   1745        zone = QTAILQ_FIRST(&ns->imp_open_zones);
   1746        if (zone) {
   1747            /*
   1748             * Automatically close this implicitly open zone.
   1749             */
   1750            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
   1751            nvme_zrm_close(ns, zone);
   1752        }
   1753    }
   1754}
   1755
   1756enum {
   1757    NVME_ZRM_AUTO = 1 << 0,
   1758};
   1759
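        /*
         * Open a zone either explicitly (Open Zone) or implicitly as a side
         * effect of a write (NVME_ZRM_AUTO). Opening an EMPTY or CLOSED zone
         * consumes open/active resources and may first auto-close an
         * implicitly open zone if the open limit has been reached.
         */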
   1760static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
   1761                                    NvmeZone *zone, int flags)
   1762{
   1763    int act = 0;
   1764    uint16_t status;
   1765
   1766    switch (nvme_get_zone_state(zone)) {
   1767    case NVME_ZONE_STATE_EMPTY:
   1768        act = 1;
   1769
   1770        /* fallthrough */
   1771
   1772    case NVME_ZONE_STATE_CLOSED:
   1773        if (n->params.auto_transition_zones) {
   1774            nvme_zrm_auto_transition_zone(ns);
   1775        }
   1776        status = nvme_aor_check(ns, act, 1);
   1777        if (status) {
   1778            return status;
   1779        }
   1780
   1781        if (act) {
   1782            nvme_aor_inc_active(ns);
   1783        }
   1784
   1785        nvme_aor_inc_open(ns);
   1786
   1787        if (flags & NVME_ZRM_AUTO) {
   1788            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
   1789            return NVME_SUCCESS;
   1790        }
   1791
   1792        /* fallthrough */
   1793
   1794    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
   1795        if (flags & NVME_ZRM_AUTO) {
   1796            return NVME_SUCCESS;
   1797        }
   1798
   1799        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
   1800
   1801        /* fallthrough */
   1802
   1803    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
   1804        return NVME_SUCCESS;
   1805
   1806    default:
   1807        return NVME_ZONE_INVAL_TRANSITION;
   1808    }
   1809}
   1810
   1811static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
   1812                                     NvmeZone *zone)
   1813{
   1814    return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
   1815}
   1816
   1817static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns,
   1818                                     NvmeZone *zone)
   1819{
   1820    return nvme_zrm_open_flags(n, ns, zone, 0);
   1821}
   1822
   1823static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
   1824                                 uint32_t nlb)
   1825{
   1826    zone->d.wp += nlb;
   1827
   1828    if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
   1829        nvme_zrm_finish(ns, zone);
   1830    }
   1831}
   1832
   1833static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
   1834{
   1835    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   1836    NvmeZone *zone;
   1837    uint64_t slba;
   1838    uint32_t nlb;
   1839
   1840    slba = le64_to_cpu(rw->slba);
   1841    nlb = le16_to_cpu(rw->nlb) + 1;
   1842    zone = nvme_get_zone_by_slba(ns, slba);
   1843    assert(zone);
   1844
   1845    nvme_advance_zone_wp(ns, zone, nlb);
   1846}
   1847
   1848static inline bool nvme_is_write(NvmeRequest *req)
   1849{
   1850    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   1851
   1852    return rw->opcode == NVME_CMD_WRITE ||
   1853           rw->opcode == NVME_CMD_ZONE_APPEND ||
   1854           rw->opcode == NVME_CMD_WRITE_ZEROES;
   1855}
   1856
   1857static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
   1858{
   1859    return qemu_get_aio_context();
   1860}
   1861
   1862static void nvme_misc_cb(void *opaque, int ret)
   1863{
   1864    NvmeRequest *req = opaque;
   1865
   1866    trace_pci_nvme_misc_cb(nvme_cid(req));
   1867
   1868    if (ret) {
   1869        nvme_aio_err(req, ret);
   1870    }
   1871
   1872    nvme_enqueue_req_completion(nvme_cq(req), req);
   1873}
   1874
   1875void nvme_rw_complete_cb(void *opaque, int ret)
   1876{
   1877    NvmeRequest *req = opaque;
   1878    NvmeNamespace *ns = req->ns;
   1879    BlockBackend *blk = ns->blkconf.blk;
   1880    BlockAcctCookie *acct = &req->acct;
   1881    BlockAcctStats *stats = blk_get_stats(blk);
   1882
   1883    trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
   1884
   1885    if (ret) {
   1886        block_acct_failed(stats, acct);
   1887        nvme_aio_err(req, ret);
   1888    } else {
   1889        block_acct_done(stats, acct);
   1890    }
   1891
   1892    if (ns->params.zoned && nvme_is_write(req)) {
   1893        nvme_finalize_zoned_write(ns, req);
   1894    }
   1895
   1896    nvme_enqueue_req_completion(nvme_cq(req), req);
   1897}
   1898
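        /*
         * First-stage read/write completion. Namespaces without metadata
         * complete immediately; otherwise a second I/O is issued for the
         * metadata area (or it is zeroed for Write Zeroes) before
         * nvme_rw_complete_cb finishes the request.
         */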
   1899static void nvme_rw_cb(void *opaque, int ret)
   1900{
   1901    NvmeRequest *req = opaque;
   1902    NvmeNamespace *ns = req->ns;
   1903
   1904    BlockBackend *blk = ns->blkconf.blk;
   1905
   1906    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
   1907
   1908    if (ret) {
   1909        goto out;
   1910    }
   1911
   1912    if (ns->lbaf.ms) {
   1913        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   1914        uint64_t slba = le64_to_cpu(rw->slba);
   1915        uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
   1916        uint64_t offset = nvme_moff(ns, slba);
   1917
   1918        if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
   1919            size_t mlen = nvme_m2b(ns, nlb);
   1920
   1921            req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
   1922                                               BDRV_REQ_MAY_UNMAP,
   1923                                               nvme_rw_complete_cb, req);
   1924            return;
   1925        }
   1926
   1927        if (nvme_ns_ext(ns) || req->cmd.mptr) {
   1928            uint16_t status;
   1929
   1930            nvme_sg_unmap(&req->sg);
   1931            status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
   1932            if (status) {
   1933                ret = -EFAULT;
   1934                goto out;
   1935            }
   1936
   1937            if (req->cmd.opcode == NVME_CMD_READ) {
   1938                return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
   1939            }
   1940
   1941            return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
   1942        }
   1943    }
   1944
   1945out:
   1946    nvme_rw_complete_cb(req, ret);
   1947}
   1948
   1949static void nvme_verify_cb(void *opaque, int ret)
   1950{
   1951    NvmeBounceContext *ctx = opaque;
   1952    NvmeRequest *req = ctx->req;
   1953    NvmeNamespace *ns = req->ns;
   1954    BlockBackend *blk = ns->blkconf.blk;
   1955    BlockAcctCookie *acct = &req->acct;
   1956    BlockAcctStats *stats = blk_get_stats(blk);
   1957    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   1958    uint64_t slba = le64_to_cpu(rw->slba);
   1959    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
   1960    uint16_t apptag = le16_to_cpu(rw->apptag);
   1961    uint16_t appmask = le16_to_cpu(rw->appmask);
   1962    uint32_t reftag = le32_to_cpu(rw->reftag);
   1963    uint16_t status;
   1964
   1965    trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
   1966
   1967    if (ret) {
   1968        block_acct_failed(stats, acct);
   1969        nvme_aio_err(req, ret);
   1970        goto out;
   1971    }
   1972
   1973    block_acct_done(stats, acct);
   1974
   1975    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
   1976        status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
   1977                                       ctx->mdata.iov.size, slba);
   1978        if (status) {
   1979            req->status = status;
   1980            goto out;
   1981        }
   1982
   1983        req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
   1984                                     ctx->mdata.bounce, ctx->mdata.iov.size,
   1985                                     prinfo, slba, apptag, appmask, &reftag);
   1986    }
   1987
   1988out:
   1989    qemu_iovec_destroy(&ctx->data.iov);
   1990    g_free(ctx->data.bounce);
   1991
   1992    qemu_iovec_destroy(&ctx->mdata.iov);
   1993    g_free(ctx->mdata.bounce);
   1994
   1995    g_free(ctx);
   1996
   1997    nvme_enqueue_req_completion(nvme_cq(req), req);
   1998}
   1999
   2000
   2001static void nvme_verify_mdata_in_cb(void *opaque, int ret)
   2002{
   2003    NvmeBounceContext *ctx = opaque;
   2004    NvmeRequest *req = ctx->req;
   2005    NvmeNamespace *ns = req->ns;
   2006    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   2007    uint64_t slba = le64_to_cpu(rw->slba);
   2008    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
   2009    size_t mlen = nvme_m2b(ns, nlb);
   2010    uint64_t offset = nvme_moff(ns, slba);
   2011    BlockBackend *blk = ns->blkconf.blk;
   2012
   2013    trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
   2014
   2015    if (ret) {
   2016        goto out;
   2017    }
   2018
   2019    ctx->mdata.bounce = g_malloc(mlen);
   2020
   2021    qemu_iovec_reset(&ctx->mdata.iov);
   2022    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
   2023
   2024    req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
   2025                                nvme_verify_cb, ctx);
   2026    return;
   2027
   2028out:
   2029    nvme_verify_cb(ctx, ret);
   2030}
   2031
   2032struct nvme_compare_ctx {
   2033    struct {
   2034        QEMUIOVector iov;
   2035        uint8_t *bounce;
   2036    } data;
   2037
   2038    struct {
   2039        QEMUIOVector iov;
   2040        uint8_t *bounce;
   2041    } mdata;
   2042};
   2043
   2044static void nvme_compare_mdata_cb(void *opaque, int ret)
   2045{
   2046    NvmeRequest *req = opaque;
   2047    NvmeNamespace *ns = req->ns;
   2048    NvmeCtrl *n = nvme_ctrl(req);
   2049    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   2050    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
   2051    uint16_t apptag = le16_to_cpu(rw->apptag);
   2052    uint16_t appmask = le16_to_cpu(rw->appmask);
   2053    uint32_t reftag = le32_to_cpu(rw->reftag);
   2054    struct nvme_compare_ctx *ctx = req->opaque;
   2055    g_autofree uint8_t *buf = NULL;
   2056    BlockBackend *blk = ns->blkconf.blk;
   2057    BlockAcctCookie *acct = &req->acct;
   2058    BlockAcctStats *stats = blk_get_stats(blk);
   2059    uint16_t status = NVME_SUCCESS;
   2060
   2061    trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
   2062
   2063    if (ret) {
   2064        block_acct_failed(stats, acct);
   2065        nvme_aio_err(req, ret);
   2066        goto out;
   2067    }
   2068
   2069    buf = g_malloc(ctx->mdata.iov.size);
   2070
   2071    status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
   2072                               NVME_TX_DIRECTION_TO_DEVICE, req);
   2073    if (status) {
   2074        req->status = status;
   2075        goto out;
   2076    }
   2077
   2078    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
   2079        uint64_t slba = le64_to_cpu(rw->slba);
   2080        uint8_t *bufp;
   2081        uint8_t *mbufp = ctx->mdata.bounce;
   2082        uint8_t *end = mbufp + ctx->mdata.iov.size;
   2083        int16_t pil = 0;
   2084
   2085        status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
   2086                                ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
   2087                                slba, apptag, appmask, &reftag);
   2088        if (status) {
   2089            req->status = status;
   2090            goto out;
   2091        }
   2092
   2093        /*
   2094         * When formatted with protection information, do not compare the DIF
   2095         * tuple.
   2096         */
   2097        if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
   2098            pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
   2099        }
   2100
   2101        for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
   2102            if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
   2103                req->status = NVME_CMP_FAILURE;
   2104                goto out;
   2105            }
   2106        }
   2107
   2108        goto out;
   2109    }
   2110
   2111    if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
   2112        req->status = NVME_CMP_FAILURE;
   2113        goto out;
   2114    }
   2115
   2116    block_acct_done(stats, acct);
   2117
   2118out:
   2119    qemu_iovec_destroy(&ctx->data.iov);
   2120    g_free(ctx->data.bounce);
   2121
   2122    qemu_iovec_destroy(&ctx->mdata.iov);
   2123    g_free(ctx->mdata.bounce);
   2124
   2125    g_free(ctx);
   2126
   2127    nvme_enqueue_req_completion(nvme_cq(req), req);
   2128}
   2129
   2130static void nvme_compare_data_cb(void *opaque, int ret)
   2131{
   2132    NvmeRequest *req = opaque;
   2133    NvmeCtrl *n = nvme_ctrl(req);
   2134    NvmeNamespace *ns = req->ns;
   2135    BlockBackend *blk = ns->blkconf.blk;
   2136    BlockAcctCookie *acct = &req->acct;
   2137    BlockAcctStats *stats = blk_get_stats(blk);
   2138
   2139    struct nvme_compare_ctx *ctx = req->opaque;
   2140    g_autofree uint8_t *buf = NULL;
   2141    uint16_t status;
   2142
   2143    trace_pci_nvme_compare_data_cb(nvme_cid(req));
   2144
   2145    if (ret) {
   2146        block_acct_failed(stats, acct);
   2147        nvme_aio_err(req, ret);
   2148        goto out;
   2149    }
   2150
   2151    buf = g_malloc(ctx->data.iov.size);
   2152
   2153    status = nvme_bounce_data(n, buf, ctx->data.iov.size,
   2154                              NVME_TX_DIRECTION_TO_DEVICE, req);
   2155    if (status) {
   2156        req->status = status;
   2157        goto out;
   2158    }
   2159
   2160    if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
   2161        req->status = NVME_CMP_FAILURE;
   2162        goto out;
   2163    }
   2164
   2165    if (ns->lbaf.ms) {
   2166        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   2167        uint64_t slba = le64_to_cpu(rw->slba);
   2168        uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
   2169        size_t mlen = nvme_m2b(ns, nlb);
   2170        uint64_t offset = nvme_moff(ns, slba);
   2171
   2172        ctx->mdata.bounce = g_malloc(mlen);
   2173
   2174        qemu_iovec_init(&ctx->mdata.iov, 1);
   2175        qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
   2176
   2177        req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
   2178                                    nvme_compare_mdata_cb, req);
   2179        return;
   2180    }
   2181
   2182    block_acct_done(stats, acct);
   2183
   2184out:
   2185    qemu_iovec_destroy(&ctx->data.iov);
   2186    g_free(ctx->data.bounce);
   2187    g_free(ctx);
   2188
   2189    nvme_enqueue_req_completion(nvme_cq(req), req);
   2190}
   2191
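        /*
         * Dataset Management (attribute AD, deallocate) is implemented as a
         * chain of asynchronous discards, one per range, driven by
         * nvme_dsm_cb and nvme_dsm_md_cb and finalized from the nvme_dsm_bh
         * bottom half.
         */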
   2192typedef struct NvmeDSMAIOCB {
   2193    BlockAIOCB common;
   2194    BlockAIOCB *aiocb;
   2195    NvmeRequest *req;
   2196    QEMUBH *bh;
   2197    int ret;
   2198
   2199    NvmeDsmRange *range;
   2200    unsigned int nr;
   2201    unsigned int idx;
   2202} NvmeDSMAIOCB;
   2203
   2204static void nvme_dsm_cancel(BlockAIOCB *aiocb)
   2205{
   2206    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
   2207
   2208    /* break nvme_dsm_cb loop */
   2209    iocb->idx = iocb->nr;
   2210    iocb->ret = -ECANCELED;
   2211
   2212    if (iocb->aiocb) {
   2213        blk_aio_cancel_async(iocb->aiocb);
   2214        iocb->aiocb = NULL;
   2215    } else {
   2216        /*
   2217         * We only reach this if nvme_dsm_cancel() has already been called or
   2218         * the command ran to completion and nvme_dsm_bh is scheduled to run.
   2219         */
   2220        assert(iocb->idx == iocb->nr);
   2221    }
   2222}
   2223
   2224static const AIOCBInfo nvme_dsm_aiocb_info = {
   2225    .aiocb_size   = sizeof(NvmeDSMAIOCB),
   2226    .cancel_async = nvme_dsm_cancel,
   2227};
   2228
   2229static void nvme_dsm_bh(void *opaque)
   2230{
   2231    NvmeDSMAIOCB *iocb = opaque;
   2232
   2233    iocb->common.cb(iocb->common.opaque, iocb->ret);
   2234
   2235    qemu_bh_delete(iocb->bh);
   2236    iocb->bh = NULL;
   2237    qemu_aio_unref(iocb);
   2238}
   2239
   2240static void nvme_dsm_cb(void *opaque, int ret);
   2241
   2242static void nvme_dsm_md_cb(void *opaque, int ret)
   2243{
   2244    NvmeDSMAIOCB *iocb = opaque;
   2245    NvmeRequest *req = iocb->req;
   2246    NvmeNamespace *ns = req->ns;
   2247    NvmeDsmRange *range;
   2248    uint64_t slba;
   2249    uint32_t nlb;
   2250
   2251    if (ret < 0) {
   2252        iocb->ret = ret;
   2253        goto done;
   2254    }
   2255
   2256    if (!ns->lbaf.ms) {
   2257        nvme_dsm_cb(iocb, 0);
   2258        return;
   2259    }
   2260
   2261    range = &iocb->range[iocb->idx - 1];
   2262    slba = le64_to_cpu(range->slba);
   2263    nlb = le32_to_cpu(range->nlb);
   2264
   2265    /*
    2266     * Check that all blocks were discarded (zeroed); otherwise we do not zero
   2267     * the metadata.
   2268     */
   2269
   2270    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
   2271    if (ret) {
   2272        if (ret < 0) {
   2273            iocb->ret = ret;
   2274            goto done;
   2275        }
   2276
    2277        nvme_dsm_cb(iocb, 0);
            return;
    2278    }
   2279
   2280    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
   2281                                        nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
   2282                                        nvme_dsm_cb, iocb);
   2283    return;
   2284
   2285done:
   2286    iocb->aiocb = NULL;
   2287    qemu_bh_schedule(iocb->bh);
   2288}
   2289
   2290static void nvme_dsm_cb(void *opaque, int ret)
   2291{
   2292    NvmeDSMAIOCB *iocb = opaque;
   2293    NvmeRequest *req = iocb->req;
   2294    NvmeCtrl *n = nvme_ctrl(req);
   2295    NvmeNamespace *ns = req->ns;
   2296    NvmeDsmRange *range;
   2297    uint64_t slba;
   2298    uint32_t nlb;
   2299
   2300    if (ret < 0) {
   2301        iocb->ret = ret;
   2302        goto done;
   2303    }
   2304
   2305next:
   2306    if (iocb->idx == iocb->nr) {
   2307        goto done;
   2308    }
   2309
   2310    range = &iocb->range[iocb->idx++];
   2311    slba = le64_to_cpu(range->slba);
   2312    nlb = le32_to_cpu(range->nlb);
   2313
   2314    trace_pci_nvme_dsm_deallocate(slba, nlb);
   2315
   2316    if (nlb > n->dmrsl) {
   2317        trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
   2318        goto next;
   2319    }
   2320
   2321    if (nvme_check_bounds(ns, slba, nlb)) {
   2322        trace_pci_nvme_err_invalid_lba_range(slba, nlb,
   2323                                             ns->id_ns.nsze);
   2324        goto next;
   2325    }
   2326
   2327    iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
   2328                                   nvme_l2b(ns, nlb),
   2329                                   nvme_dsm_md_cb, iocb);
   2330    return;
   2331
   2332done:
   2333    iocb->aiocb = NULL;
   2334    qemu_bh_schedule(iocb->bh);
   2335}
   2336
   2337static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
   2338{
   2339    NvmeNamespace *ns = req->ns;
   2340    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
   2341    uint32_t attr = le32_to_cpu(dsm->attributes);
   2342    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
   2343    uint16_t status = NVME_SUCCESS;
   2344
   2345    trace_pci_nvme_dsm(nr, attr);
   2346
   2347    if (attr & NVME_DSMGMT_AD) {
   2348        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
   2349                                         nvme_misc_cb, req);
   2350
   2351        iocb->req = req;
   2352        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
   2353        iocb->ret = 0;
   2354        iocb->range = g_new(NvmeDsmRange, nr);
   2355        iocb->nr = nr;
   2356        iocb->idx = 0;
   2357
   2358        status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
   2359                          req);
   2360        if (status) {
   2361            return status;
   2362        }
   2363
   2364        req->aiocb = &iocb->common;
   2365        nvme_dsm_cb(iocb, 0);
   2366
   2367        return NVME_NO_COMPLETE;
   2368    }
   2369
   2370    return status;
   2371}
   2372
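        /*
         * Verify reads data (and metadata, if present) into bounce buffers
         * and, when the namespace is formatted with protection information,
         * checks it; no data is transferred to the host.
         */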
   2373static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
   2374{
   2375    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   2376    NvmeNamespace *ns = req->ns;
   2377    BlockBackend *blk = ns->blkconf.blk;
   2378    uint64_t slba = le64_to_cpu(rw->slba);
   2379    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
   2380    size_t len = nvme_l2b(ns, nlb);
   2381    int64_t offset = nvme_l2b(ns, slba);
   2382    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
   2383    uint32_t reftag = le32_to_cpu(rw->reftag);
   2384    NvmeBounceContext *ctx = NULL;
   2385    uint16_t status;
   2386
   2387    trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
   2388
   2389    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
   2390        status = nvme_check_prinfo(ns, prinfo, slba, reftag);
   2391        if (status) {
   2392            return status;
   2393        }
   2394
   2395        if (prinfo & NVME_PRINFO_PRACT) {
   2396            return NVME_INVALID_PROT_INFO | NVME_DNR;
   2397        }
   2398    }
   2399
   2400    if (len > n->page_size << n->params.vsl) {
   2401        return NVME_INVALID_FIELD | NVME_DNR;
   2402    }
   2403
   2404    status = nvme_check_bounds(ns, slba, nlb);
   2405    if (status) {
   2406        return status;
   2407    }
   2408
   2409    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
   2410        status = nvme_check_dulbe(ns, slba, nlb);
   2411        if (status) {
   2412            return status;
   2413        }
   2414    }
   2415
   2416    ctx = g_new0(NvmeBounceContext, 1);
   2417    ctx->req = req;
   2418
   2419    ctx->data.bounce = g_malloc(len);
   2420
   2421    qemu_iovec_init(&ctx->data.iov, 1);
   2422    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
   2423
   2424    block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
   2425                     BLOCK_ACCT_READ);
   2426
   2427    req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
   2428                                nvme_verify_mdata_in_cb, ctx);
   2429    return NVME_NO_COMPLETE;
   2430}
   2431
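        /*
         * Simple Copy. Each source range is read into the bounce buffer
         * (nvme_copy_cb -> nvme_copy_in_cb), checked, and written out at the
         * destination starting from SDLBA (nvme_copy_in_completed_cb ->
         * nvme_copy_out_cb), advancing iocb->slba as ranges complete.
         */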
   2432typedef struct NvmeCopyAIOCB {
   2433    BlockAIOCB common;
   2434    BlockAIOCB *aiocb;
   2435    NvmeRequest *req;
   2436    QEMUBH *bh;
   2437    int ret;
   2438
   2439    NvmeCopySourceRange *ranges;
   2440    int nr;
   2441    int idx;
   2442
   2443    uint8_t *bounce;
   2444    QEMUIOVector iov;
   2445    struct {
   2446        BlockAcctCookie read;
   2447        BlockAcctCookie write;
   2448    } acct;
   2449
   2450    uint32_t reftag;
   2451    uint64_t slba;
   2452
   2453    NvmeZone *zone;
   2454} NvmeCopyAIOCB;
   2455
   2456static void nvme_copy_cancel(BlockAIOCB *aiocb)
   2457{
   2458    NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
   2459
   2460    iocb->ret = -ECANCELED;
   2461
   2462    if (iocb->aiocb) {
   2463        blk_aio_cancel_async(iocb->aiocb);
   2464        iocb->aiocb = NULL;
   2465    }
   2466}
   2467
   2468static const AIOCBInfo nvme_copy_aiocb_info = {
   2469    .aiocb_size   = sizeof(NvmeCopyAIOCB),
   2470    .cancel_async = nvme_copy_cancel,
   2471};
   2472
   2473static void nvme_copy_bh(void *opaque)
   2474{
   2475    NvmeCopyAIOCB *iocb = opaque;
   2476    NvmeRequest *req = iocb->req;
   2477    NvmeNamespace *ns = req->ns;
   2478    BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
   2479
   2480    if (iocb->idx != iocb->nr) {
   2481        req->cqe.result = cpu_to_le32(iocb->idx);
   2482    }
   2483
   2484    qemu_iovec_destroy(&iocb->iov);
   2485    g_free(iocb->bounce);
   2486
   2487    qemu_bh_delete(iocb->bh);
   2488    iocb->bh = NULL;
   2489
   2490    if (iocb->ret < 0) {
   2491        block_acct_failed(stats, &iocb->acct.read);
   2492        block_acct_failed(stats, &iocb->acct.write);
   2493    } else {
   2494        block_acct_done(stats, &iocb->acct.read);
   2495        block_acct_done(stats, &iocb->acct.write);
   2496    }
   2497
   2498    iocb->common.cb(iocb->common.opaque, iocb->ret);
   2499    qemu_aio_unref(iocb);
   2500}
   2501
   2502static void nvme_copy_cb(void *opaque, int ret);
   2503
   2504static void nvme_copy_out_completed_cb(void *opaque, int ret)
   2505{
   2506    NvmeCopyAIOCB *iocb = opaque;
   2507    NvmeRequest *req = iocb->req;
   2508    NvmeNamespace *ns = req->ns;
   2509    NvmeCopySourceRange *range = &iocb->ranges[iocb->idx];
   2510    uint32_t nlb = le32_to_cpu(range->nlb) + 1;
   2511
   2512    if (ret < 0) {
   2513        iocb->ret = ret;
   2514        goto out;
   2515    } else if (iocb->ret < 0) {
   2516        goto out;
   2517    }
   2518
   2519    if (ns->params.zoned) {
   2520        nvme_advance_zone_wp(ns, iocb->zone, nlb);
   2521    }
   2522
   2523    iocb->idx++;
   2524    iocb->slba += nlb;
   2525out:
   2526    nvme_copy_cb(iocb, iocb->ret);
   2527}
   2528
   2529static void nvme_copy_out_cb(void *opaque, int ret)
   2530{
   2531    NvmeCopyAIOCB *iocb = opaque;
   2532    NvmeRequest *req = iocb->req;
   2533    NvmeNamespace *ns = req->ns;
   2534    NvmeCopySourceRange *range;
   2535    uint32_t nlb;
   2536    size_t mlen;
   2537    uint8_t *mbounce;
   2538
   2539    if (ret < 0) {
   2540        iocb->ret = ret;
   2541        goto out;
   2542    } else if (iocb->ret < 0) {
   2543        goto out;
   2544    }
   2545
   2546    if (!ns->lbaf.ms) {
   2547        nvme_copy_out_completed_cb(iocb, 0);
   2548        return;
   2549    }
   2550
   2551    range = &iocb->ranges[iocb->idx];
   2552    nlb = le32_to_cpu(range->nlb) + 1;
   2553
   2554    mlen = nvme_m2b(ns, nlb);
   2555    mbounce = iocb->bounce + nvme_l2b(ns, nlb);
   2556
   2557    qemu_iovec_reset(&iocb->iov);
   2558    qemu_iovec_add(&iocb->iov, mbounce, mlen);
   2559
   2560    iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
   2561                                  &iocb->iov, 0, nvme_copy_out_completed_cb,
   2562                                  iocb);
   2563
   2564    return;
   2565
   2566out:
   2567    nvme_copy_cb(iocb, ret);
   2568}
   2569
   2570static void nvme_copy_in_completed_cb(void *opaque, int ret)
   2571{
   2572    NvmeCopyAIOCB *iocb = opaque;
   2573    NvmeRequest *req = iocb->req;
   2574    NvmeNamespace *ns = req->ns;
   2575    NvmeCopySourceRange *range;
   2576    uint32_t nlb;
   2577    size_t len;
   2578    uint16_t status;
   2579
   2580    if (ret < 0) {
   2581        iocb->ret = ret;
   2582        goto out;
   2583    } else if (iocb->ret < 0) {
   2584        goto out;
   2585    }
   2586
   2587    range = &iocb->ranges[iocb->idx];
   2588    nlb = le32_to_cpu(range->nlb) + 1;
   2589    len = nvme_l2b(ns, nlb);
   2590
   2591    trace_pci_nvme_copy_out(iocb->slba, nlb);
   2592
   2593    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
   2594        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
   2595
   2596        uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
   2597        uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
   2598
   2599        uint16_t apptag = le16_to_cpu(range->apptag);
   2600        uint16_t appmask = le16_to_cpu(range->appmask);
   2601        uint32_t reftag = le32_to_cpu(range->reftag);
   2602
   2603        uint64_t slba = le64_to_cpu(range->slba);
   2604        size_t mlen = nvme_m2b(ns, nlb);
   2605        uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
   2606
   2607        status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
   2608                                slba, apptag, appmask, &reftag);
   2609        if (status) {
   2610            goto invalid;
   2611        }
   2612
   2613        apptag = le16_to_cpu(copy->apptag);
   2614        appmask = le16_to_cpu(copy->appmask);
   2615
   2616        if (prinfow & NVME_PRINFO_PRACT) {
   2617            status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
   2618            if (status) {
   2619                goto invalid;
   2620            }
   2621
   2622            nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
   2623                                        apptag, &iocb->reftag);
   2624        } else {
   2625            status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
   2626                                    prinfow, iocb->slba, apptag, appmask,
   2627                                    &iocb->reftag);
   2628            if (status) {
   2629                goto invalid;
   2630            }
   2631        }
   2632    }
   2633
   2634    status = nvme_check_bounds(ns, iocb->slba, nlb);
   2635    if (status) {
   2636        goto invalid;
   2637    }
   2638
   2639    if (ns->params.zoned) {
   2640        status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
   2641        if (status) {
   2642            goto invalid;
   2643        }
   2644
   2645        iocb->zone->w_ptr += nlb;
   2646    }
   2647
   2648    qemu_iovec_reset(&iocb->iov);
   2649    qemu_iovec_add(&iocb->iov, iocb->bounce, len);
   2650
   2651    iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
   2652                                  &iocb->iov, 0, nvme_copy_out_cb, iocb);
   2653
   2654    return;
   2655
   2656invalid:
   2657    req->status = status;
   2658    iocb->aiocb = NULL;
   2659    if (iocb->bh) {
   2660        qemu_bh_schedule(iocb->bh);
   2661    }
   2662
   2663    return;
   2664
   2665out:
   2666    nvme_copy_cb(iocb, ret);
   2667}
   2668
   2669static void nvme_copy_in_cb(void *opaque, int ret)
   2670{
   2671    NvmeCopyAIOCB *iocb = opaque;
   2672    NvmeRequest *req = iocb->req;
   2673    NvmeNamespace *ns = req->ns;
   2674    NvmeCopySourceRange *range;
   2675    uint64_t slba;
   2676    uint32_t nlb;
   2677
   2678    if (ret < 0) {
   2679        iocb->ret = ret;
   2680        goto out;
   2681    } else if (iocb->ret < 0) {
   2682        goto out;
   2683    }
   2684
   2685    if (!ns->lbaf.ms) {
   2686        nvme_copy_in_completed_cb(iocb, 0);
   2687        return;
   2688    }
   2689
   2690    range = &iocb->ranges[iocb->idx];
   2691    slba = le64_to_cpu(range->slba);
   2692    nlb = le32_to_cpu(range->nlb) + 1;
   2693
   2694    qemu_iovec_reset(&iocb->iov);
   2695    qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
   2696                   nvme_m2b(ns, nlb));
   2697
   2698    iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
   2699                                 &iocb->iov, 0, nvme_copy_in_completed_cb,
   2700                                 iocb);
   2701    return;
   2702
   2703out:
   2704    nvme_copy_cb(iocb, iocb->ret);
   2705}
   2706
   2707static void nvme_copy_cb(void *opaque, int ret)
   2708{
   2709    NvmeCopyAIOCB *iocb = opaque;
   2710    NvmeRequest *req = iocb->req;
   2711    NvmeNamespace *ns = req->ns;
   2712    NvmeCopySourceRange *range;
   2713    uint64_t slba;
   2714    uint32_t nlb;
   2715    size_t len;
   2716    uint16_t status;
   2717
   2718    if (ret < 0) {
   2719        iocb->ret = ret;
   2720        goto done;
   2721    } else if (iocb->ret < 0) {
   2722        goto done;
   2723    }
   2724
   2725    if (iocb->idx == iocb->nr) {
   2726        goto done;
   2727    }
   2728
   2729    range = &iocb->ranges[iocb->idx];
   2730    slba = le64_to_cpu(range->slba);
   2731    nlb = le32_to_cpu(range->nlb) + 1;
   2732    len = nvme_l2b(ns, nlb);
   2733
   2734    trace_pci_nvme_copy_source_range(slba, nlb);
   2735
   2736    if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
   2737        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
   2738        goto invalid;
   2739    }
   2740
   2741    status = nvme_check_bounds(ns, slba, nlb);
   2742    if (status) {
   2743        goto invalid;
   2744    }
   2745
   2746    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
   2747        status = nvme_check_dulbe(ns, slba, nlb);
   2748        if (status) {
   2749            goto invalid;
   2750        }
   2751    }
   2752
   2753    if (ns->params.zoned) {
   2754        status = nvme_check_zone_read(ns, slba, nlb);
   2755        if (status) {
   2756            goto invalid;
   2757        }
   2758    }
   2759
   2760    qemu_iovec_reset(&iocb->iov);
   2761    qemu_iovec_add(&iocb->iov, iocb->bounce, len);
   2762
   2763    iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
   2764                                 &iocb->iov, 0, nvme_copy_in_cb, iocb);
   2765    return;
   2766
   2767invalid:
   2768    req->status = status;
   2769done:
   2770    iocb->aiocb = NULL;
   2771    if (iocb->bh) {
   2772        qemu_bh_schedule(iocb->bh);
   2773    }
   2774}
   2775
   2776
   2777static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
   2778{
   2779    NvmeNamespace *ns = req->ns;
   2780    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
   2781    NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
   2782                                      nvme_misc_cb, req);
   2783    uint16_t nr = copy->nr + 1;
   2784    uint8_t format = copy->control[0] & 0xf;
   2785    uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
   2786    uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
   2787
   2788    uint16_t status;
   2789
   2790    trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
   2791
   2792    iocb->ranges = NULL;
   2793    iocb->zone = NULL;
   2794
   2795    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
   2796        ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
   2797        status = NVME_INVALID_FIELD | NVME_DNR;
   2798        goto invalid;
   2799    }
   2800
   2801    if (!(n->id_ctrl.ocfs & (1 << format))) {
   2802        trace_pci_nvme_err_copy_invalid_format(format);
   2803        status = NVME_INVALID_FIELD | NVME_DNR;
   2804        goto invalid;
   2805    }
   2806
   2807    if (nr > ns->id_ns.msrc + 1) {
   2808        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
   2809        goto invalid;
   2810    }
   2811
   2812    iocb->ranges = g_new(NvmeCopySourceRange, nr);
   2813
   2814    status = nvme_h2c(n, (uint8_t *)iocb->ranges,
   2815                      sizeof(NvmeCopySourceRange) * nr, req);
   2816    if (status) {
   2817        goto invalid;
   2818    }
   2819
   2820    iocb->slba = le64_to_cpu(copy->sdlba);
   2821
   2822    if (ns->params.zoned) {
   2823        iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
   2824        if (!iocb->zone) {
   2825            status = NVME_LBA_RANGE | NVME_DNR;
   2826            goto invalid;
   2827        }
   2828
   2829        status = nvme_zrm_auto(n, ns, iocb->zone);
   2830        if (status) {
   2831            goto invalid;
   2832        }
   2833    }
   2834
   2835    iocb->req = req;
   2836    iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
   2837    iocb->ret = 0;
   2838    iocb->nr = nr;
   2839    iocb->idx = 0;
   2840    iocb->reftag = le32_to_cpu(copy->reftag);
   2841    iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
   2842                              ns->lbasz + ns->lbaf.ms);
   2843
   2844    qemu_iovec_init(&iocb->iov, 1);
   2845
   2846    block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
   2847                     BLOCK_ACCT_READ);
   2848    block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
   2849                     BLOCK_ACCT_WRITE);
   2850
   2851    req->aiocb = &iocb->common;
   2852    nvme_copy_cb(iocb, 0);
   2853
   2854    return NVME_NO_COMPLETE;
   2855
   2856invalid:
   2857    g_free(iocb->ranges);
   2858    qemu_aio_unref(iocb);
   2859    return status;
   2860}
   2861
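        /*
         * Compare reads the LBA range from storage into a bounce buffer and
         * memcmp()s it against the data supplied by the host; if the
         * namespace has metadata, nvme_compare_mdata_cb compares that in a
         * second stage.
         */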
   2862static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
   2863{
   2864    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   2865    NvmeNamespace *ns = req->ns;
   2866    BlockBackend *blk = ns->blkconf.blk;
   2867    uint64_t slba = le64_to_cpu(rw->slba);
   2868    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
   2869    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
   2870    size_t data_len = nvme_l2b(ns, nlb);
   2871    size_t len = data_len;
   2872    int64_t offset = nvme_l2b(ns, slba);
   2873    struct nvme_compare_ctx *ctx = NULL;
   2874    uint16_t status;
   2875
   2876    trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
   2877
   2878    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
   2879        return NVME_INVALID_PROT_INFO | NVME_DNR;
   2880    }
   2881
   2882    if (nvme_ns_ext(ns)) {
   2883        len += nvme_m2b(ns, nlb);
   2884    }
   2885
   2886    status = nvme_check_mdts(n, len);
   2887    if (status) {
   2888        return status;
   2889    }
   2890
   2891    status = nvme_check_bounds(ns, slba, nlb);
   2892    if (status) {
   2893        return status;
   2894    }
   2895
   2896    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
   2897        status = nvme_check_dulbe(ns, slba, nlb);
   2898        if (status) {
   2899            return status;
   2900        }
   2901    }
   2902
   2903    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
   2904    if (status) {
   2905        return status;
   2906    }
   2907
   2908    ctx = g_new(struct nvme_compare_ctx, 1);
   2909    ctx->data.bounce = g_malloc(data_len);
   2910
   2911    req->opaque = ctx;
   2912
   2913    qemu_iovec_init(&ctx->data.iov, 1);
   2914    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
   2915
   2916    block_acct_start(blk_get_stats(blk), &req->acct, data_len,
   2917                     BLOCK_ACCT_READ);
   2918    req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
   2919                                nvme_compare_data_cb, req);
   2920
   2921    return NVME_NO_COMPLETE;
   2922}
   2923
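        /*
         * Flush. With the broadcast NSID (FFFFFFFFh) every attached
         * namespace is flushed in turn via nvme_flush_bh/nvme_flush_ns_cb;
         * otherwise only the addressed namespace is flushed.
         */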
   2924typedef struct NvmeFlushAIOCB {
   2925    BlockAIOCB common;
   2926    BlockAIOCB *aiocb;
   2927    NvmeRequest *req;
   2928    QEMUBH *bh;
   2929    int ret;
   2930
   2931    NvmeNamespace *ns;
   2932    uint32_t nsid;
   2933    bool broadcast;
   2934} NvmeFlushAIOCB;
   2935
   2936static void nvme_flush_cancel(BlockAIOCB *acb)
   2937{
   2938    NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
   2939
   2940    iocb->ret = -ECANCELED;
   2941
   2942    if (iocb->aiocb) {
   2943        blk_aio_cancel_async(iocb->aiocb);
   2944    }
   2945}
   2946
   2947static const AIOCBInfo nvme_flush_aiocb_info = {
   2948    .aiocb_size = sizeof(NvmeFlushAIOCB),
   2949    .cancel_async = nvme_flush_cancel,
   2950    .get_aio_context = nvme_get_aio_context,
   2951};
   2952
   2953static void nvme_flush_ns_cb(void *opaque, int ret)
   2954{
   2955    NvmeFlushAIOCB *iocb = opaque;
   2956    NvmeNamespace *ns = iocb->ns;
   2957
   2958    if (ret < 0) {
   2959        iocb->ret = ret;
   2960        goto out;
   2961    } else if (iocb->ret < 0) {
   2962        goto out;
   2963    }
   2964
   2965    if (ns) {
   2966        trace_pci_nvme_flush_ns(iocb->nsid);
   2967
   2968        iocb->ns = NULL;
   2969        iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
   2970        return;
   2971    }
   2972
   2973out:
   2974    iocb->aiocb = NULL;
   2975    qemu_bh_schedule(iocb->bh);
   2976}
   2977
   2978static void nvme_flush_bh(void *opaque)
   2979{
   2980    NvmeFlushAIOCB *iocb = opaque;
   2981    NvmeRequest *req = iocb->req;
   2982    NvmeCtrl *n = nvme_ctrl(req);
   2983    int i;
   2984
   2985    if (iocb->ret < 0) {
   2986        goto done;
   2987    }
   2988
   2989    if (iocb->broadcast) {
   2990        for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
   2991            iocb->ns = nvme_ns(n, i);
   2992            if (iocb->ns) {
   2993                iocb->nsid = i;
   2994                break;
   2995            }
   2996        }
   2997    }
   2998
   2999    if (!iocb->ns) {
   3000        goto done;
   3001    }
   3002
   3003    nvme_flush_ns_cb(iocb, 0);
   3004    return;
   3005
   3006done:
   3007    qemu_bh_delete(iocb->bh);
   3008    iocb->bh = NULL;
   3009
   3010    iocb->common.cb(iocb->common.opaque, iocb->ret);
   3011
   3012    qemu_aio_unref(iocb);
   3013
   3014    return;
   3015}
   3016
   3017static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
   3018{
   3019    NvmeFlushAIOCB *iocb;
   3020    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
   3021    uint16_t status;
   3022
   3023    iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
   3024
   3025    iocb->req = req;
   3026    iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
   3027    iocb->ret = 0;
   3028    iocb->ns = NULL;
   3029    iocb->nsid = 0;
   3030    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
   3031
   3032    if (!iocb->broadcast) {
   3033        if (!nvme_nsid_valid(n, nsid)) {
   3034            status = NVME_INVALID_NSID | NVME_DNR;
   3035            goto out;
   3036        }
   3037
   3038        iocb->ns = nvme_ns(n, nsid);
   3039        if (!iocb->ns) {
   3040            status = NVME_INVALID_FIELD | NVME_DNR;
   3041            goto out;
   3042        }
   3043
   3044        iocb->nsid = nsid;
   3045    }
   3046
   3047    req->aiocb = &iocb->common;
   3048    qemu_bh_schedule(iocb->bh);
   3049
   3050    return NVME_NO_COMPLETE;
   3051
   3052out:
   3053    qemu_bh_delete(iocb->bh);
   3054    iocb->bh = NULL;
   3055    qemu_aio_unref(iocb);
   3056
   3057    return status;
   3058}
   3059
   3060static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
   3061{
   3062    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   3063    NvmeNamespace *ns = req->ns;
   3064    uint64_t slba = le64_to_cpu(rw->slba);
   3065    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
   3066    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
   3067    uint64_t data_size = nvme_l2b(ns, nlb);
   3068    uint64_t mapped_size = data_size;
   3069    uint64_t data_offset;
   3070    BlockBackend *blk = ns->blkconf.blk;
   3071    uint16_t status;
   3072
   3073    if (nvme_ns_ext(ns)) {
   3074        mapped_size += nvme_m2b(ns, nlb);
   3075
   3076        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
   3077            bool pract = prinfo & NVME_PRINFO_PRACT;
   3078
   3079            if (pract && ns->lbaf.ms == 8) {
   3080                mapped_size = data_size;
   3081            }
   3082        }
   3083    }
   3084
   3085    trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
   3086
   3087    status = nvme_check_mdts(n, mapped_size);
   3088    if (status) {
   3089        goto invalid;
   3090    }
   3091
   3092    status = nvme_check_bounds(ns, slba, nlb);
   3093    if (status) {
   3094        goto invalid;
   3095    }
   3096
   3097    if (ns->params.zoned) {
   3098        status = nvme_check_zone_read(ns, slba, nlb);
   3099        if (status) {
   3100            trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
   3101            goto invalid;
   3102        }
   3103    }
   3104
   3105    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
   3106        status = nvme_check_dulbe(ns, slba, nlb);
   3107        if (status) {
   3108            goto invalid;
   3109        }
   3110    }
   3111
   3112    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
   3113        return nvme_dif_rw(n, req);
   3114    }
   3115
   3116    status = nvme_map_data(n, nlb, req);
   3117    if (status) {
   3118        goto invalid;
   3119    }
   3120
   3121    data_offset = nvme_l2b(ns, slba);
   3122
   3123    block_acct_start(blk_get_stats(blk), &req->acct, data_size,
   3124                     BLOCK_ACCT_READ);
   3125    nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
   3126    return NVME_NO_COMPLETE;
   3127
   3128invalid:
   3129    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
   3130    return status | NVME_DNR;
   3131}
   3132
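        /*
         * Common implementation of Write, Write Zeroes (wrz) and Zone Append
         * (append). For Zone Append the submitted SLBA must be the zone start
         * LBA and is replaced by the current zone write pointer, which is
         * returned to the host in the completion entry.
         */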
   3133static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
   3134                              bool wrz)
   3135{
   3136    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
   3137    NvmeNamespace *ns = req->ns;
   3138    uint64_t slba = le64_to_cpu(rw->slba);
   3139    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
   3140    uint16_t ctrl = le16_to_cpu(rw->control);
   3141    uint8_t prinfo = NVME_RW_PRINFO(ctrl);
   3142    uint64_t data_size = nvme_l2b(ns, nlb);
   3143    uint64_t mapped_size = data_size;
   3144    uint64_t data_offset;
   3145    NvmeZone *zone;
   3146    NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
   3147    BlockBackend *blk = ns->blkconf.blk;
   3148    uint16_t status;
   3149
   3150    if (nvme_ns_ext(ns)) {
   3151        mapped_size += nvme_m2b(ns, nlb);
   3152
   3153        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
   3154            bool pract = prinfo & NVME_PRINFO_PRACT;
   3155
   3156            if (pract && ns->lbaf.ms == 8) {
   3157                mapped_size -= nvme_m2b(ns, nlb);
   3158            }
   3159        }
   3160    }
   3161
   3162    trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
   3163                         nvme_nsid(ns), nlb, mapped_size, slba);
   3164
   3165    if (!wrz) {
   3166        status = nvme_check_mdts(n, mapped_size);
   3167        if (status) {
   3168            goto invalid;
   3169        }
   3170    }
   3171
   3172    status = nvme_check_bounds(ns, slba, nlb);
   3173    if (status) {
   3174        goto invalid;
   3175    }
   3176
   3177    if (ns->params.zoned) {
   3178        zone = nvme_get_zone_by_slba(ns, slba);
   3179        assert(zone);
   3180
   3181        if (append) {
   3182            bool piremap = !!(ctrl & NVME_RW_PIREMAP);
   3183
   3184            if (unlikely(slba != zone->d.zslba)) {
   3185                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
   3186                status = NVME_INVALID_FIELD;
   3187                goto invalid;
   3188            }
   3189
   3190            if (n->params.zasl &&
   3191                data_size > (uint64_t)n->page_size << n->params.zasl) {
   3192                trace_pci_nvme_err_zasl(data_size);
   3193                return NVME_INVALID_FIELD | NVME_DNR;
   3194            }
   3195
   3196            slba = zone->w_ptr;
   3197            rw->slba = cpu_to_le64(slba);
   3198            res->slba = cpu_to_le64(slba);
   3199
   3200            switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
   3201            case NVME_ID_NS_DPS_TYPE_1:
   3202                if (!piremap) {
   3203                    return NVME_INVALID_PROT_INFO | NVME_DNR;
   3204                }
   3205
   3206                /* fallthrough */
   3207
   3208            case NVME_ID_NS_DPS_TYPE_2:
   3209                if (piremap) {
   3210                    uint32_t reftag = le32_to_cpu(rw->reftag);
   3211                    rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
   3212                }
   3213
   3214                break;
   3215
   3216            case NVME_ID_NS_DPS_TYPE_3:
   3217                if (piremap) {
   3218                    return NVME_INVALID_PROT_INFO | NVME_DNR;
   3219                }
   3220
   3221                break;
   3222            }
   3223        }
   3224
   3225        status = nvme_check_zone_write(ns, zone, slba, nlb);
   3226        if (status) {
   3227            goto invalid;
   3228        }
   3229
   3230        status = nvme_zrm_auto(n, ns, zone);
   3231        if (status) {
   3232            goto invalid;
   3233        }
   3234
   3235        zone->w_ptr += nlb;
   3236    }
   3237
   3238    data_offset = nvme_l2b(ns, slba);
   3239
   3240    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
   3241        return nvme_dif_rw(n, req);
   3242    }
   3243
   3244    if (!wrz) {
   3245        status = nvme_map_data(n, nlb, req);
   3246        if (status) {
   3247            goto invalid;
   3248        }
   3249
   3250        block_acct_start(blk_get_stats(blk), &req->acct, data_size,
   3251                         BLOCK_ACCT_WRITE);
   3252        nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
   3253    } else {
   3254        req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
   3255                                           BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
   3256                                           req);
   3257    }
   3258
   3259    return NVME_NO_COMPLETE;
   3260
   3261invalid:
   3262    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
   3263    return status | NVME_DNR;
   3264}
   3265
   3266static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
   3267{
   3268    return nvme_do_write(n, req, false, false);
   3269}
   3270
   3271static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
   3272{
   3273    return nvme_do_write(n, req, false, true);
   3274}
   3275
   3276static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
   3277{
   3278    return nvme_do_write(n, req, true, false);
   3279}
   3280
   3281static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
   3282                                            uint64_t *slba, uint32_t *zone_idx)
   3283{
   3284    uint32_t dw10 = le32_to_cpu(c->cdw10);
   3285    uint32_t dw11 = le32_to_cpu(c->cdw11);
   3286
   3287    if (!ns->params.zoned) {
   3288        trace_pci_nvme_err_invalid_opc(c->opcode);
   3289        return NVME_INVALID_OPCODE | NVME_DNR;
   3290    }
   3291
   3292    *slba = ((uint64_t)dw11) << 32 | dw10;
   3293    if (unlikely(*slba >= ns->id_ns.nsze)) {
   3294        trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
   3295        *slba = 0;
   3296        return NVME_LBA_RANGE | NVME_DNR;
   3297    }
   3298
   3299    *zone_idx = nvme_zone_idx(ns, *slba);
   3300    assert(*zone_idx < ns->num_zones);
   3301
   3302    return NVME_SUCCESS;
   3303}
   3304
   3305typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
   3306                                 NvmeRequest *);
   3307
   3308enum NvmeZoneProcessingMask {
   3309    NVME_PROC_CURRENT_ZONE    = 0,
   3310    NVME_PROC_OPENED_ZONES    = 1 << 0,
   3311    NVME_PROC_CLOSED_ZONES    = 1 << 1,
   3312    NVME_PROC_READ_ONLY_ZONES = 1 << 2,
   3313    NVME_PROC_FULL_ZONES      = 1 << 3,
   3314};
   3315
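       /*
        * When a Zone Management Send command has the Select All bit set, only
        * zones in the states selected by the mask are bulk-processed, e.g.
        * "Close All" uses NVME_PROC_OPENED_ZONES and "Finish All" uses
        * NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES (see
        * nvme_zone_mgmt_send()).
        */
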
   3316static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
   3317                               NvmeZoneState state, NvmeRequest *req)
   3318{
   3319    return nvme_zrm_open(nvme_ctrl(req), ns, zone);
   3320}
   3321
   3322static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
   3323                                NvmeZoneState state, NvmeRequest *req)
   3324{
   3325    return nvme_zrm_close(ns, zone);
   3326}
   3327
   3328static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
   3329                                 NvmeZoneState state, NvmeRequest *req)
   3330{
   3331    return nvme_zrm_finish(ns, zone);
   3332}
   3333
   3334static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
   3335                                  NvmeZoneState state, NvmeRequest *req)
   3336{
   3337    switch (state) {
   3338    case NVME_ZONE_STATE_READ_ONLY:
   3339        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
   3340        /* fall through */
   3341    case NVME_ZONE_STATE_OFFLINE:
   3342        return NVME_SUCCESS;
   3343    default:
   3344        return NVME_ZONE_INVAL_TRANSITION;
   3345    }
   3346}
   3347
   3348static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
   3349{
   3350    uint16_t status;
   3351    uint8_t state = nvme_get_zone_state(zone);
   3352
   3353    if (state == NVME_ZONE_STATE_EMPTY) {
   3354        status = nvme_aor_check(ns, 1, 0);
   3355        if (status) {
   3356            return status;
   3357        }
   3358        nvme_aor_inc_active(ns);
   3359        zone->d.za |= NVME_ZA_ZD_EXT_VALID;
   3360        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
   3361        return NVME_SUCCESS;
   3362    }
   3363
   3364    return NVME_ZONE_INVAL_TRANSITION;
   3365}
   3366
   3367static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
   3368                                    enum NvmeZoneProcessingMask proc_mask,
   3369                                    op_handler_t op_hndlr, NvmeRequest *req)
   3370{
   3371    uint16_t status = NVME_SUCCESS;
   3372    NvmeZoneState zs = nvme_get_zone_state(zone);
   3373    bool proc_zone;
   3374
   3375    switch (zs) {
   3376    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
   3377    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
   3378        proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
   3379        break;
   3380    case NVME_ZONE_STATE_CLOSED:
   3381        proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
   3382        break;
   3383    case NVME_ZONE_STATE_READ_ONLY:
   3384        proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
   3385        break;
   3386    case NVME_ZONE_STATE_FULL:
   3387        proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
   3388        break;
   3389    default:
   3390        proc_zone = false;
   3391    }
   3392
   3393    if (proc_zone) {
   3394        status = op_hndlr(ns, zone, zs, req);
   3395    }
   3396
   3397    return status;
   3398}
   3399
   3400static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
   3401                                enum NvmeZoneProcessingMask proc_mask,
   3402                                op_handler_t op_hndlr, NvmeRequest *req)
   3403{
   3404    NvmeZone *next;
   3405    uint16_t status = NVME_SUCCESS;
   3406    int i;
   3407
   3408    if (!proc_mask) {
   3409        status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
   3410    } else {
   3411        if (proc_mask & NVME_PROC_CLOSED_ZONES) {
   3412            QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
   3413                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
   3414                                             req);
   3415                if (status && status != NVME_NO_COMPLETE) {
   3416                    goto out;
   3417                }
   3418            }
   3419        }
   3420        if (proc_mask & NVME_PROC_OPENED_ZONES) {
   3421            QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
   3422                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
   3423                                             req);
   3424                if (status && status != NVME_NO_COMPLETE) {
   3425                    goto out;
   3426                }
   3427            }
   3428
   3429            QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
   3430                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
   3431                                             req);
   3432                if (status && status != NVME_NO_COMPLETE) {
   3433                    goto out;
   3434                }
   3435            }
   3436        }
   3437        if (proc_mask & NVME_PROC_FULL_ZONES) {
   3438            QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
   3439                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
   3440                                             req);
   3441                if (status && status != NVME_NO_COMPLETE) {
   3442                    goto out;
   3443                }
   3444            }
   3445        }
   3446
   3447        if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
   3448            for (i = 0; i < ns->num_zones; i++, zone++) {
   3449                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
   3450                                             req);
   3451                if (status && status != NVME_NO_COMPLETE) {
   3452                    goto out;
   3453                }
   3454            }
   3455        }
   3456    }
   3457
   3458out:
   3459    return status;
   3460}
   3461
   3462typedef struct NvmeZoneResetAIOCB {
   3463    BlockAIOCB common;
   3464    BlockAIOCB *aiocb;
   3465    NvmeRequest *req;
   3466    QEMUBH *bh;
   3467    int ret;
   3468
   3469    bool all;
   3470    int idx;
   3471    NvmeZone *zone;
   3472} NvmeZoneResetAIOCB;
   3473
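       /*
        * Zone Reset runs as a chain of asynchronous callbacks: nvme_zone_reset_cb()
        * resets the zone it just processed (if any), then walks the zone array
        * from the addressed zone and issues a write-zeroes for the next
        * resettable zone's data; nvme_zone_reset_epilogue_cb() zeroes that zone's
        * metadata, if any, and re-enters nvme_zone_reset_cb(). Without Select All
        * the walk stops after the first zone; once no zones remain, or on
        * error/cancellation, the bottom half completes the request.
        */
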
   3474static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
   3475{
   3476    NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
   3477    NvmeRequest *req = iocb->req;
   3478    NvmeNamespace *ns = req->ns;
   3479
   3480    iocb->idx = ns->num_zones;
   3481
   3482    iocb->ret = -ECANCELED;
   3483
   3484    if (iocb->aiocb) {
   3485        blk_aio_cancel_async(iocb->aiocb);
   3486        iocb->aiocb = NULL;
   3487    }
   3488}
   3489
   3490static const AIOCBInfo nvme_zone_reset_aiocb_info = {
   3491    .aiocb_size = sizeof(NvmeZoneResetAIOCB),
   3492    .cancel_async = nvme_zone_reset_cancel,
   3493};
   3494
   3495static void nvme_zone_reset_bh(void *opaque)
   3496{
   3497    NvmeZoneResetAIOCB *iocb = opaque;
   3498
   3499    iocb->common.cb(iocb->common.opaque, iocb->ret);
   3500
   3501    qemu_bh_delete(iocb->bh);
   3502    iocb->bh = NULL;
   3503    qemu_aio_unref(iocb);
   3504}
   3505
   3506static void nvme_zone_reset_cb(void *opaque, int ret);
   3507
   3508static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
   3509{
   3510    NvmeZoneResetAIOCB *iocb = opaque;
   3511    NvmeRequest *req = iocb->req;
   3512    NvmeNamespace *ns = req->ns;
   3513    int64_t moff;
   3514    int count;
   3515
   3516    if (ret < 0) {
   3517        nvme_zone_reset_cb(iocb, ret);
   3518        return;
   3519    }
   3520
   3521    if (!ns->lbaf.ms) {
   3522        nvme_zone_reset_cb(iocb, 0);
   3523        return;
   3524    }
   3525
   3526    moff = nvme_moff(ns, iocb->zone->d.zslba);
   3527    count = nvme_m2b(ns, ns->zone_size);
   3528
   3529    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
   3530                                        BDRV_REQ_MAY_UNMAP,
   3531                                        nvme_zone_reset_cb, iocb);
   3532    return;
   3533}
   3534
   3535static void nvme_zone_reset_cb(void *opaque, int ret)
   3536{
   3537    NvmeZoneResetAIOCB *iocb = opaque;
   3538    NvmeRequest *req = iocb->req;
   3539    NvmeNamespace *ns = req->ns;
   3540
   3541    if (ret < 0) {
   3542        iocb->ret = ret;
   3543        goto done;
   3544    }
   3545
   3546    if (iocb->zone) {
   3547        nvme_zrm_reset(ns, iocb->zone);
   3548
   3549        if (!iocb->all) {
   3550            goto done;
   3551        }
   3552    }
   3553
   3554    while (iocb->idx < ns->num_zones) {
   3555        NvmeZone *zone = &ns->zone_array[iocb->idx++];
   3556
   3557        switch (nvme_get_zone_state(zone)) {
   3558        case NVME_ZONE_STATE_EMPTY:
   3559            if (!iocb->all) {
   3560                goto done;
   3561            }
   3562
   3563            continue;
   3564
   3565        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
   3566        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
   3567        case NVME_ZONE_STATE_CLOSED:
   3568        case NVME_ZONE_STATE_FULL:
   3569            iocb->zone = zone;
   3570            break;
   3571
   3572        default:
   3573            continue;
   3574        }
   3575
   3576        trace_pci_nvme_zns_zone_reset(zone->d.zslba);
   3577
   3578        iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
   3579                                            nvme_l2b(ns, zone->d.zslba),
   3580                                            nvme_l2b(ns, ns->zone_size),
   3581                                            BDRV_REQ_MAY_UNMAP,
   3582                                            nvme_zone_reset_epilogue_cb,
   3583                                            iocb);
   3584        return;
   3585    }
   3586
   3587done:
   3588    iocb->aiocb = NULL;
   3589    if (iocb->bh) {
   3590        qemu_bh_schedule(iocb->bh);
   3591    }
   3592}
   3593
   3594static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
   3595{
   3596    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
   3597    NvmeNamespace *ns = req->ns;
   3598    NvmeZone *zone;
   3599    NvmeZoneResetAIOCB *iocb;
   3600    uint8_t *zd_ext;
   3601    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
   3602    uint64_t slba = 0;
   3603    uint32_t zone_idx = 0;
   3604    uint16_t status;
   3605    uint8_t action;
   3606    bool all;
   3607    enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
   3608
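           /*
            * Command Dword 13 carries the Zone Send Action in bits 7:0 and the
            * Select All flag in bit 8; e.g. dw13 = 104h requests a Zone Reset
            * (action 04h) applied to every zone in the namespace.
            */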
   3609    action = dw13 & 0xff;
   3610    all = !!(dw13 & 0x100);
   3611
   3612    req->status = NVME_SUCCESS;
   3613
   3614    if (!all) {
   3615        status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
   3616        if (status) {
   3617            return status;
   3618        }
   3619    }
   3620
   3621    zone = &ns->zone_array[zone_idx];
   3622    if (slba != zone->d.zslba) {
   3623        trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
   3624        return NVME_INVALID_FIELD | NVME_DNR;
   3625    }
   3626
   3627    switch (action) {
   3628
   3629    case NVME_ZONE_ACTION_OPEN:
   3630        if (all) {
   3631            proc_mask = NVME_PROC_CLOSED_ZONES;
   3632        }
   3633        trace_pci_nvme_open_zone(slba, zone_idx, all);
   3634        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
   3635        break;
   3636
   3637    case NVME_ZONE_ACTION_CLOSE:
   3638        if (all) {
   3639            proc_mask = NVME_PROC_OPENED_ZONES;
   3640        }
   3641        trace_pci_nvme_close_zone(slba, zone_idx, all);
   3642        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
   3643        break;
   3644
   3645    case NVME_ZONE_ACTION_FINISH:
   3646        if (all) {
   3647            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
   3648        }
   3649        trace_pci_nvme_finish_zone(slba, zone_idx, all);
   3650        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
   3651        break;
   3652
   3653    case NVME_ZONE_ACTION_RESET:
   3654        trace_pci_nvme_reset_zone(slba, zone_idx, all);
   3655
   3656        iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
   3657                           nvme_misc_cb, req);
   3658
   3659        iocb->req = req;
   3660        iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
   3661        iocb->ret = 0;
   3662        iocb->all = all;
   3663        iocb->idx = zone_idx;
   3664        iocb->zone = NULL;
   3665
   3666        req->aiocb = &iocb->common;
   3667        nvme_zone_reset_cb(iocb, 0);
   3668
   3669        return NVME_NO_COMPLETE;
   3670
   3671    case NVME_ZONE_ACTION_OFFLINE:
   3672        if (all) {
   3673            proc_mask = NVME_PROC_READ_ONLY_ZONES;
   3674        }
   3675        trace_pci_nvme_offline_zone(slba, zone_idx, all);
   3676        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
   3677        break;
   3678
   3679    case NVME_ZONE_ACTION_SET_ZD_EXT:
   3680        trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
   3681        if (all || !ns->params.zd_extension_size) {
   3682            return NVME_INVALID_FIELD | NVME_DNR;
   3683        }
   3684        zd_ext = nvme_get_zd_extension(ns, zone_idx);
   3685        status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
   3686        if (status) {
   3687            trace_pci_nvme_err_zd_extension_map_error(zone_idx);
   3688            return status;
   3689        }
   3690
   3691        status = nvme_set_zd_ext(ns, zone);
   3692        if (status == NVME_SUCCESS) {
   3693            trace_pci_nvme_zd_extension_set(zone_idx);
   3694            return status;
   3695        }
   3696        break;
   3697
   3698    default:
   3699        trace_pci_nvme_err_invalid_mgmt_action(action);
   3700        status = NVME_INVALID_FIELD;
   3701    }
   3702
   3703    if (status == NVME_ZONE_INVAL_TRANSITION) {
   3704        trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
   3705                                                         zone->d.za);
   3706    }
   3707    if (status) {
   3708        status |= NVME_DNR;
   3709    }
   3710
   3711    return status;
   3712}
   3713
   3714static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
   3715{
   3716    NvmeZoneState zs = nvme_get_zone_state(zl);
   3717
   3718    switch (zafs) {
   3719    case NVME_ZONE_REPORT_ALL:
   3720        return true;
   3721    case NVME_ZONE_REPORT_EMPTY:
   3722        return zs == NVME_ZONE_STATE_EMPTY;
   3723    case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
   3724        return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
   3725    case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
   3726        return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
   3727    case NVME_ZONE_REPORT_CLOSED:
   3728        return zs == NVME_ZONE_STATE_CLOSED;
   3729    case NVME_ZONE_REPORT_FULL:
   3730        return zs == NVME_ZONE_STATE_FULL;
   3731    case NVME_ZONE_REPORT_READ_ONLY:
   3732        return zs == NVME_ZONE_STATE_READ_ONLY;
   3733    case NVME_ZONE_REPORT_OFFLINE:
   3734        return zs == NVME_ZONE_STATE_OFFLINE;
   3735    default:
   3736        return false;
   3737    }
   3738}
   3739
   3740static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
   3741{
   3742    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
   3743    NvmeNamespace *ns = req->ns;
   3744    /* cdw12 is the zero-based number of dwords to return; convert it to bytes */
   3745    uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
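           /* e.g. cdw12 = 3FFh yields (3FFh + 1) << 2 = 4096 bytes */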
   3746    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
   3747    uint32_t zone_idx, zra, zrasf, partial;
   3748    uint64_t max_zones, nr_zones = 0;
   3749    uint16_t status;
   3750    uint64_t slba;
   3751    NvmeZoneDescr *z;
   3752    NvmeZone *zone;
   3753    NvmeZoneReportHeader *header;
   3754    void *buf, *buf_p;
   3755    size_t zone_entry_sz;
   3756    int i;
   3757
   3758    req->status = NVME_SUCCESS;
   3759
   3760    status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
   3761    if (status) {
   3762        return status;
   3763    }
   3764
   3765    zra = dw13 & 0xff;
   3766    if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
   3767        return NVME_INVALID_FIELD | NVME_DNR;
   3768    }
   3769    if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
   3770        return NVME_INVALID_FIELD | NVME_DNR;
   3771    }
   3772
   3773    zrasf = (dw13 >> 8) & 0xff;
   3774    if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
   3775        return NVME_INVALID_FIELD | NVME_DNR;
   3776    }
   3777
   3778    if (data_size < sizeof(NvmeZoneReportHeader)) {
   3779        return NVME_INVALID_FIELD | NVME_DNR;
   3780    }
   3781
   3782    status = nvme_check_mdts(n, data_size);
   3783    if (status) {
   3784        return status;
   3785    }
   3786
   3787    partial = (dw13 >> 16) & 0x01;
   3788
   3789    zone_entry_sz = sizeof(NvmeZoneDescr);
   3790    if (zra == NVME_ZONE_REPORT_EXTENDED) {
   3791        zone_entry_sz += ns->params.zd_extension_size;
   3792    }
   3793
   3794    max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
   3795    buf = g_malloc0(data_size);
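           /*
            * The report buffer is an NvmeZoneReportHeader followed by
            * zone_entry_sz-sized entries (an NvmeZoneDescr plus, for extended
            * reports, the zone descriptor extension); max_zones is the number of
            * full entries that fit after the header.
            */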
   3796
   3797    zone = &ns->zone_array[zone_idx];
   3798    for (i = zone_idx; i < ns->num_zones; i++) {
   3799        if (partial && nr_zones >= max_zones) {
   3800            break;
   3801        }
   3802        if (nvme_zone_matches_filter(zrasf, zone++)) {
   3803            nr_zones++;
   3804        }
   3805    }
   3806    header = (NvmeZoneReportHeader *)buf;
   3807    header->nr_zones = cpu_to_le64(nr_zones);
   3808
   3809    buf_p = buf + sizeof(NvmeZoneReportHeader);
   3810    for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
   3811        zone = &ns->zone_array[zone_idx];
   3812        if (nvme_zone_matches_filter(zrasf, zone)) {
   3813            z = (NvmeZoneDescr *)buf_p;
   3814            buf_p += sizeof(NvmeZoneDescr);
   3815
   3816            z->zt = zone->d.zt;
   3817            z->zs = zone->d.zs;
   3818            z->zcap = cpu_to_le64(zone->d.zcap);
   3819            z->zslba = cpu_to_le64(zone->d.zslba);
   3820            z->za = zone->d.za;
   3821
   3822            if (nvme_wp_is_valid(zone)) {
   3823                z->wp = cpu_to_le64(zone->d.wp);
   3824            } else {
   3825                z->wp = cpu_to_le64(~0ULL);
   3826            }
   3827
   3828            if (zra == NVME_ZONE_REPORT_EXTENDED) {
   3829                if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
   3830                    memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
   3831                           ns->params.zd_extension_size);
   3832                }
   3833                buf_p += ns->params.zd_extension_size;
   3834            }
   3835
   3836            max_zones--;
   3837        }
   3838    }
   3839
   3840    status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
   3841
   3842    g_free(buf);
   3843
   3844    return status;
   3845}
   3846
   3847static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
   3848{
   3849    NvmeNamespace *ns;
   3850    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
   3851
   3852    trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
   3853                          req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
   3854
   3855    if (!nvme_nsid_valid(n, nsid)) {
   3856        return NVME_INVALID_NSID | NVME_DNR;
   3857    }
   3858
   3859    /*
   3860     * In the base NVM command set, Flush may apply to all namespaces
   3861     * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
   3862     * along with TP 4056 (Namespace Types), the semantics become ambiguous.
   3863     *
   3864     * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
   3865     * opcode with a specific command since we cannot determine a unique I/O
   3866     * command set. Opcode 0h is not guaranteed to mean anything
   3867     * equivalent to flushing and may well have completely different
   3868     * semantics in some other command set - does an NSID of FFFFFFFFh then
   3869     * mean "for all namespaces, apply whatever command set specific command
   3870     * uses the 0h opcode"? Or does it mean "for all namespaces, apply
   3871     * whatever command uses the 0h opcode if, and only if, that command
   3872     * allows NSID to be FFFFFFFFh"?
   3873     *
   3874     * Anyway (and luckily), for now, we do not care about this since the
   3875     * device only supports namespace types that include the NVM Flush command
   3876     * (NVM and Zoned), so always do an NVM Flush.
   3877     */
   3878    if (req->cmd.opcode == NVME_CMD_FLUSH) {
   3879        return nvme_flush(n, req);
   3880    }
   3881
   3882    ns = nvme_ns(n, nsid);
   3883    if (unlikely(!ns)) {
   3884        return NVME_INVALID_FIELD | NVME_DNR;
   3885    }
   3886
   3887    if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
   3888        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
   3889        return NVME_INVALID_OPCODE | NVME_DNR;
   3890    }
   3891
   3892    if (ns->status) {
   3893        return ns->status;
   3894    }
   3895
   3896    if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
   3897        return NVME_INVALID_FIELD;
   3898    }
   3899
   3900    req->ns = ns;
   3901
   3902    switch (req->cmd.opcode) {
   3903    case NVME_CMD_WRITE_ZEROES:
   3904        return nvme_write_zeroes(n, req);
   3905    case NVME_CMD_ZONE_APPEND:
   3906        return nvme_zone_append(n, req);
   3907    case NVME_CMD_WRITE:
   3908        return nvme_write(n, req);
   3909    case NVME_CMD_READ:
   3910        return nvme_read(n, req);
   3911    case NVME_CMD_COMPARE:
   3912        return nvme_compare(n, req);
   3913    case NVME_CMD_DSM:
   3914        return nvme_dsm(n, req);
   3915    case NVME_CMD_VERIFY:
   3916        return nvme_verify(n, req);
   3917    case NVME_CMD_COPY:
   3918        return nvme_copy(n, req);
   3919    case NVME_CMD_ZONE_MGMT_SEND:
   3920        return nvme_zone_mgmt_send(n, req);
   3921    case NVME_CMD_ZONE_MGMT_RECV:
   3922        return nvme_zone_mgmt_recv(n, req);
   3923    default:
   3924        assert(false);
   3925    }
   3926
   3927    return NVME_INVALID_OPCODE | NVME_DNR;
   3928}
   3929
   3930static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
   3931{
   3932    n->sq[sq->sqid] = NULL;
   3933    timer_free(sq->timer);
   3934    g_free(sq->io_req);
   3935    if (sq->sqid) {
   3936        g_free(sq);
   3937    }
   3938}
   3939
   3940static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
   3941{
   3942    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
   3943    NvmeRequest *r, *next;
   3944    NvmeSQueue *sq;
   3945    NvmeCQueue *cq;
   3946    uint16_t qid = le16_to_cpu(c->qid);
   3947
   3948    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
   3949        trace_pci_nvme_err_invalid_del_sq(qid);
   3950        return NVME_INVALID_QID | NVME_DNR;
   3951    }
   3952
   3953    trace_pci_nvme_del_sq(qid);
   3954
   3955    sq = n->sq[qid];
   3956    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
   3957        r = QTAILQ_FIRST(&sq->out_req_list);
   3958        assert(r->aiocb);
   3959        blk_aio_cancel(r->aiocb);
   3960    }
   3961
   3962    assert(QTAILQ_EMPTY(&sq->out_req_list));
   3963
   3964    if (!nvme_check_cqid(n, sq->cqid)) {
   3965        cq = n->cq[sq->cqid];
   3966        QTAILQ_REMOVE(&cq->sq_list, sq, entry);
   3967
   3968        nvme_post_cqes(cq);
   3969        QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
   3970            if (r->sq == sq) {
   3971                QTAILQ_REMOVE(&cq->req_list, r, entry);
   3972                QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
   3973            }
   3974        }
   3975    }
   3976
   3977    nvme_free_sq(sq, n);
   3978    return NVME_SUCCESS;
   3979}
   3980
   3981static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
   3982                         uint16_t sqid, uint16_t cqid, uint16_t size)
   3983{
   3984    int i;
   3985    NvmeCQueue *cq;
   3986
   3987    sq->ctrl = n;
   3988    sq->dma_addr = dma_addr;
   3989    sq->sqid = sqid;
   3990    sq->size = size;
   3991    sq->cqid = cqid;
   3992    sq->head = sq->tail = 0;
   3993    sq->io_req = g_new0(NvmeRequest, sq->size);
   3994
   3995    QTAILQ_INIT(&sq->req_list);
   3996    QTAILQ_INIT(&sq->out_req_list);
   3997    for (i = 0; i < sq->size; i++) {
   3998        sq->io_req[i].sq = sq;
   3999        QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
   4000    }
   4001    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
   4002
   4003    assert(n->cq[cqid]);
   4004    cq = n->cq[cqid];
   4005    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
   4006    n->sq[sqid] = sq;
   4007}
   4008
   4009static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
   4010{
   4011    NvmeSQueue *sq;
   4012    NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
   4013
   4014    uint16_t cqid = le16_to_cpu(c->cqid);
   4015    uint16_t sqid = le16_to_cpu(c->sqid);
   4016    uint16_t qsize = le16_to_cpu(c->qsize);
   4017    uint16_t qflags = le16_to_cpu(c->sq_flags);
   4018    uint64_t prp1 = le64_to_cpu(c->prp1);
   4019
   4020    trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
   4021
   4022    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
   4023        trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
   4024        return NVME_INVALID_CQID | NVME_DNR;
   4025    }
   4026    if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
   4027        n->sq[sqid] != NULL)) {
   4028        trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
   4029        return NVME_INVALID_QID | NVME_DNR;
   4030    }
   4031    if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
   4032        trace_pci_nvme_err_invalid_create_sq_size(qsize);
   4033        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
   4034    }
   4035    if (unlikely(prp1 & (n->page_size - 1))) {
   4036        trace_pci_nvme_err_invalid_create_sq_addr(prp1);
   4037        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
   4038    }
   4039    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
   4040        trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
   4041        return NVME_INVALID_FIELD | NVME_DNR;
   4042    }
   4043    sq = g_malloc0(sizeof(*sq));
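           /* QSIZE in the command is a 0's based value, hence the + 1 below */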
   4044    nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
   4045    return NVME_SUCCESS;
   4046}
   4047
   4048struct nvme_stats {
   4049    uint64_t units_read;
   4050    uint64_t units_written;
   4051    uint64_t read_commands;
   4052    uint64_t write_commands;
   4053};
   4054
   4055static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
   4056{
   4057    BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
   4058
   4059    stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
   4060    stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
   4061    stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
   4062    stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
   4063}
   4064
   4065static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
   4066                                uint64_t off, NvmeRequest *req)
   4067{
   4068    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
   4069    struct nvme_stats stats = { 0 };
   4070    NvmeSmartLog smart = { 0 };
   4071    uint32_t trans_len;
   4072    NvmeNamespace *ns;
   4073    time_t current_ms;
   4074
   4075    if (off >= sizeof(smart)) {
   4076        return NVME_INVALID_FIELD | NVME_DNR;
   4077    }
   4078
   4079    if (nsid != 0xffffffff) {
   4080        ns = nvme_ns(n, nsid);
   4081        if (!ns) {
   4082            return NVME_INVALID_NSID | NVME_DNR;
   4083        }
   4084        nvme_set_blk_stats(ns, &stats);
   4085    } else {
   4086        int i;
   4087
   4088        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
   4089            ns = nvme_ns(n, i);
   4090            if (!ns) {
   4091                continue;
   4092            }
   4093            nvme_set_blk_stats(ns, &stats);
   4094        }
   4095    }
   4096
   4097    trans_len = MIN(sizeof(smart) - off, buf_len);
   4098    smart.critical_warning = n->smart_critical_warning;
   4099
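           /*
            * Data Units Read/Written are reported in thousands of 512-byte units,
            * which is why nvme_set_blk_stats() converts byte counts into 512-byte
            * sectors and the totals are divided by 1000 (rounding up) here.
            */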
   4100    smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
   4101                                                        1000));
   4102    smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
   4103                                                           1000));
   4104    smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
   4105    smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
   4106
   4107    smart.temperature = cpu_to_le16(n->temperature);
   4108
   4109    if ((n->temperature >= n->features.temp_thresh_hi) ||
   4110        (n->temperature <= n->features.temp_thresh_low)) {
   4111        smart.critical_warning |= NVME_SMART_TEMPERATURE;
   4112    }
   4113
   4114    current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
   4115    smart.power_on_hours[0] =
   4116        cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
   4117
   4118    if (!rae) {
   4119        nvme_clear_events(n, NVME_AER_TYPE_SMART);
   4120    }
   4121
   4122    return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
   4123}
   4124
   4125static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
   4126                                 NvmeRequest *req)
   4127{
   4128    uint32_t trans_len;
   4129    NvmeFwSlotInfoLog fw_log = {
   4130        .afi = 0x1,
   4131    };
   4132
   4133    if (off >= sizeof(fw_log)) {
   4134        return NVME_INVALID_FIELD | NVME_DNR;
   4135    }
   4136
   4137    strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
   4138    trans_len = MIN(sizeof(fw_log) - off, buf_len);
   4139
   4140    return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
   4141}
   4142
   4143static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
   4144                                uint64_t off, NvmeRequest *req)
   4145{
   4146    uint32_t trans_len;
   4147    NvmeErrorLog errlog;
   4148
   4149    if (off >= sizeof(errlog)) {
   4150        return NVME_INVALID_FIELD | NVME_DNR;
   4151    }
   4152
   4153    if (!rae) {
   4154        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
   4155    }
   4156
   4157    memset(&errlog, 0x0, sizeof(errlog));
   4158    trans_len = MIN(sizeof(errlog) - off, buf_len);
   4159
   4160    return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
   4161}
   4162
   4163static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
   4164                                    uint64_t off, NvmeRequest *req)
   4165{
   4166    uint32_t nslist[1024];
   4167    uint32_t trans_len;
   4168    int i = 0;
   4169    uint32_t nsid;
   4170
   4171    memset(nslist, 0x0, sizeof(nslist));
   4172    trans_len = MIN(sizeof(nslist) - off, buf_len);
   4173
   4174    while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
   4175            NVME_CHANGED_NSID_SIZE) {
   4176        /*
   4177         * If there are more than 1024 changed namespaces, the spec requires the
   4178         * first entry in the log page to be set to FFFFFFFFh and the rest to 0.
   4179         */
   4180        if (i == ARRAY_SIZE(nslist)) {
   4181            memset(nslist, 0x0, sizeof(nslist));
   4182            nslist[0] = 0xffffffff;
   4183            break;
   4184        }
   4185
   4186        nslist[i++] = nsid;
   4187        clear_bit(nsid, n->changed_nsids);
   4188    }
   4189
   4190    /*
   4191     * Clear all remaining changed-namespace bits if we returned early because
   4192     * more than 1024 namespaces have changed.
   4193     */
   4194    if (nslist[0] == 0xffffffff) {
   4195        bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
   4196    }
   4197
   4198    if (!rae) {
   4199        nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
   4200    }
   4201
   4202    return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
   4203}
   4204
   4205static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
   4206                                 uint64_t off, NvmeRequest *req)
   4207{
   4208    NvmeEffectsLog log = {};
   4209    const uint32_t *src_iocs = NULL;
   4210    uint32_t trans_len;
   4211
   4212    if (off >= sizeof(log)) {
   4213        trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
   4214        return NVME_INVALID_FIELD | NVME_DNR;
   4215    }
   4216
   4217    switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
   4218    case NVME_CC_CSS_NVM:
   4219        src_iocs = nvme_cse_iocs_nvm;
   4220        /* fall through */
   4221    case NVME_CC_CSS_ADMIN_ONLY:
   4222        break;
   4223    case NVME_CC_CSS_CSI:
   4224        switch (csi) {
   4225        case NVME_CSI_NVM:
   4226            src_iocs = nvme_cse_iocs_nvm;
   4227            break;
   4228        case NVME_CSI_ZONED:
   4229            src_iocs = nvme_cse_iocs_zoned;
   4230            break;
   4231        }
   4232    }
   4233
   4234    memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
   4235
   4236    if (src_iocs) {
   4237        memcpy(log.iocs, src_iocs, sizeof(log.iocs));
   4238    }
   4239
   4240    trans_len = MIN(sizeof(log) - off, buf_len);
   4241
   4242    return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
   4243}
   4244
   4245static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
   4246{
   4247    NvmeCmd *cmd = &req->cmd;
   4248
   4249    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
   4250    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
   4251    uint32_t dw12 = le32_to_cpu(cmd->cdw12);
   4252    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
   4253    uint8_t  lid = dw10 & 0xff;
   4254    uint8_t  lsp = (dw10 >> 8) & 0xf;
   4255    uint8_t  rae = (dw10 >> 15) & 0x1;
   4256    uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
   4257    uint32_t numdl, numdu;
   4258    uint64_t off, lpol, lpou;
   4259    size_t   len;
   4260    uint16_t status;
   4261
   4262    numdl = (dw10 >> 16);
   4263    numdu = (dw11 & 0xffff);
   4264    lpol = dw12;
   4265    lpou = dw13;
   4266
   4267    len = (((numdu << 16) | numdl) + 1) << 2;
   4268    off = (lpou << 32ULL) | lpol;
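           /*
            * NUMD is a 0's based dword count split across NUMDL (cdw10 bits
            * 31:16) and NUMDU (cdw11 bits 15:0); the log page offset is a 64-bit
            * byte offset split across LPOL/LPOU. For example, numdl = FFh with
            * numdu = 0 requests (FFh + 1) * 4 = 1024 bytes.
            */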
   4269
   4270    if (off & 0x3) {
   4271        return NVME_INVALID_FIELD | NVME_DNR;
   4272    }
   4273
   4274    trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
   4275
   4276    status = nvme_check_mdts(n, len);
   4277    if (status) {
   4278        return status;
   4279    }
   4280
   4281    switch (lid) {
   4282    case NVME_LOG_ERROR_INFO:
   4283        return nvme_error_info(n, rae, len, off, req);
   4284    case NVME_LOG_SMART_INFO:
   4285        return nvme_smart_info(n, rae, len, off, req);
   4286    case NVME_LOG_FW_SLOT_INFO:
   4287        return nvme_fw_log_info(n, len, off, req);
   4288    case NVME_LOG_CHANGED_NSLIST:
   4289        return nvme_changed_nslist(n, rae, len, off, req);
   4290    case NVME_LOG_CMD_EFFECTS:
   4291        return nvme_cmd_effects(n, csi, len, off, req);
   4292    default:
   4293        trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
   4294        return NVME_INVALID_FIELD | NVME_DNR;
   4295    }
   4296}
   4297
   4298static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
   4299{
   4300    n->cq[cq->cqid] = NULL;
   4301    timer_free(cq->timer);
   4302    if (msix_enabled(&n->parent_obj)) {
   4303        msix_vector_unuse(&n->parent_obj, cq->vector);
   4304    }
   4305    if (cq->cqid) {
   4306        g_free(cq);
   4307    }
   4308}
   4309
   4310static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
   4311{
   4312    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
   4313    NvmeCQueue *cq;
   4314    uint16_t qid = le16_to_cpu(c->qid);
   4315
   4316    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
   4317        trace_pci_nvme_err_invalid_del_cq_cqid(qid);
   4318        return NVME_INVALID_CQID | NVME_DNR;
   4319    }
   4320
   4321    cq = n->cq[qid];
   4322    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
   4323        trace_pci_nvme_err_invalid_del_cq_notempty(qid);
   4324        return NVME_INVALID_QUEUE_DEL;
   4325    }
   4326
   4327    if (cq->irq_enabled && cq->tail != cq->head) {
   4328        n->cq_pending--;
   4329    }
   4330
   4331    nvme_irq_deassert(n, cq);
   4332    trace_pci_nvme_del_cq(qid);
   4333    nvme_free_cq(cq, n);
   4334    return NVME_SUCCESS;
   4335}
   4336
   4337static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
   4338                         uint16_t cqid, uint16_t vector, uint16_t size,
   4339                         uint16_t irq_enabled)
   4340{
   4341    int ret;
   4342
   4343    if (msix_enabled(&n->parent_obj)) {
   4344        ret = msix_vector_use(&n->parent_obj, vector);
   4345        assert(ret == 0);
   4346    }
   4347    cq->ctrl = n;
   4348    cq->cqid = cqid;
   4349    cq->size = size;
   4350    cq->dma_addr = dma_addr;
   4351    cq->phase = 1;
   4352    cq->irq_enabled = irq_enabled;
   4353    cq->vector = vector;
   4354    cq->head = cq->tail = 0;
   4355    QTAILQ_INIT(&cq->req_list);
   4356    QTAILQ_INIT(&cq->sq_list);
   4357    n->cq[cqid] = cq;
   4358    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
   4359}
   4360
   4361static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
   4362{
   4363    NvmeCQueue *cq;
   4364    NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
   4365    uint16_t cqid = le16_to_cpu(c->cqid);
   4366    uint16_t vector = le16_to_cpu(c->irq_vector);
   4367    uint16_t qsize = le16_to_cpu(c->qsize);
   4368    uint16_t qflags = le16_to_cpu(c->cq_flags);
   4369    uint64_t prp1 = le64_to_cpu(c->prp1);
   4370
   4371    trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
   4372                             NVME_CQ_FLAGS_IEN(qflags) != 0);
   4373
   4374    if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
   4375        n->cq[cqid] != NULL)) {
   4376        trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
   4377        return NVME_INVALID_QID | NVME_DNR;
   4378    }
   4379    if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
   4380        trace_pci_nvme_err_invalid_create_cq_size(qsize);
   4381        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
   4382    }
   4383    if (unlikely(prp1 & (n->page_size - 1))) {
   4384        trace_pci_nvme_err_invalid_create_cq_addr(prp1);
   4385        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
   4386    }
   4387    if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
   4388        trace_pci_nvme_err_invalid_create_cq_vector(vector);
   4389        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
   4390    }
   4391    if (unlikely(vector >= n->params.msix_qsize)) {
   4392        trace_pci_nvme_err_invalid_create_cq_vector(vector);
   4393        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
   4394    }
   4395    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
   4396        trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
   4397        return NVME_INVALID_FIELD | NVME_DNR;
   4398    }
   4399
   4400    cq = g_malloc0(sizeof(*cq));
   4401    nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
   4402                 NVME_CQ_FLAGS_IEN(qflags));
   4403
   4404    /*
   4405     * It is only required to set qs_created when creating a completion queue;
   4406     * creating a submission queue without a matching completion queue will
   4407     * fail.
   4408     */
   4409    n->qs_created = true;
   4410    return NVME_SUCCESS;
   4411}
   4412
   4413static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
   4414{
   4415    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
   4416
   4417    return nvme_c2h(n, id, sizeof(id), req);
   4418}
   4419
   4420static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
   4421{
   4422    trace_pci_nvme_identify_ctrl();
   4423
   4424    return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
   4425}
   4426
   4427static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
   4428{
   4429    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
   4430    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
   4431    NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
   4432
   4433    trace_pci_nvme_identify_ctrl_csi(c->csi);
   4434
   4435    switch (c->csi) {
   4436    case NVME_CSI_NVM:
   4437        id_nvm->vsl = n->params.vsl;
   4438        id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
   4439        break;
   4440
   4441    case NVME_CSI_ZONED:
   4442        ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
   4443        break;
   4444
   4445    default:
   4446        return NVME_INVALID_FIELD | NVME_DNR;
   4447    }
   4448
   4449    return nvme_c2h(n, id, sizeof(id), req);
   4450}
   4451
   4452static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
   4453{
   4454    NvmeNamespace *ns;
   4455    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
   4456    uint32_t nsid = le32_to_cpu(c->nsid);
   4457
   4458    trace_pci_nvme_identify_ns(nsid);
   4459
   4460    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
   4461        return NVME_INVALID_NSID | NVME_DNR;
   4462    }
   4463
   4464    ns = nvme_ns(n, nsid);
   4465    if (unlikely(!ns)) {
   4466        if (!active) {
   4467            ns = nvme_subsys_ns(n->subsys, nsid);
   4468            if (!ns) {
   4469                return nvme_rpt_empty_id_struct(n, req);
   4470            }
   4471        } else {
   4472            return nvme_rpt_empty_id_struct(n, req);
   4473        }
   4474    }
   4475
   4476    if (active || ns->csi == NVME_CSI_NVM) {
   4477        return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
   4478    }
   4479
   4480    return NVME_INVALID_CMD_SET | NVME_DNR;
   4481}
   4482
   4483static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
   4484                                        bool attached)
   4485{
   4486    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
   4487    uint32_t nsid = le32_to_cpu(c->nsid);
   4488    uint16_t min_id = le16_to_cpu(c->ctrlid);
   4489    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
   4490    uint16_t *ids = &list[1];
   4491    NvmeNamespace *ns;
   4492    NvmeCtrl *ctrl;
   4493    int cntlid, nr_ids = 0;
   4494
   4495    trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
   4496
   4497    if (!n->subsys) {
   4498        return NVME_INVALID_FIELD | NVME_DNR;
   4499    }
   4500
   4501    if (attached) {
   4502        if (nsid == NVME_NSID_BROADCAST) {
   4503            return NVME_INVALID_FIELD | NVME_DNR;
   4504        }
   4505
   4506        ns = nvme_subsys_ns(n->subsys, nsid);
   4507        if (!ns) {
   4508            return NVME_INVALID_FIELD | NVME_DNR;
   4509        }
   4510    }
   4511
   4512    for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
   4513        ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
   4514        if (!ctrl) {
   4515            continue;
   4516        }
   4517
   4518        if (attached && !nvme_ns(ctrl, nsid)) {
   4519            continue;
   4520        }
   4521
   4522        ids[nr_ids++] = cntlid;
   4523    }
   4524
   4525    list[0] = nr_ids;
   4526
   4527    return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
   4528}
   4529
   4530static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
   4531                                     bool active)
   4532{
   4533    NvmeNamespace *ns;
   4534    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
   4535    uint32_t nsid = le32_to_cpu(c->nsid);
   4536
   4537    trace_pci_nvme_identify_ns_csi(nsid, c->csi);
   4538
   4539    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
   4540        return NVME_INVALID_NSID | NVME_DNR;
   4541    }
   4542
   4543    ns = nvme_ns(n, nsid);
   4544    if (unlikely(!ns)) {
   4545        if (!active) {
   4546            ns = nvme_subsys_ns(n->subsys, nsid);
   4547            if (!ns) {
   4548                return nvme_rpt_empty_id_struct(n, req);
   4549            }
   4550        } else {
   4551            return nvme_rpt_empty_id_struct(n, req);
   4552        }
   4553    }
   4554
   4555    if (c->csi == NVME_CSI_NVM) {
   4556        return nvme_rpt_empty_id_struct(n, req);
   4557    } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
   4558        return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
   4559                        req);
   4560    }
   4561
   4562    return NVME_INVALID_FIELD | NVME_DNR;
   4563}
   4564
   4565static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
   4566                                     bool active)
   4567{
   4568    NvmeNamespace *ns;
   4569    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
   4570    uint32_t min_nsid = le32_to_cpu(c->nsid);
   4571    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
   4572    static const int data_len = sizeof(list);
   4573    uint32_t *list_ptr = (uint32_t *)list;
   4574    int i, j = 0;
   4575
   4576    trace_pci_nvme_identify_nslist(min_nsid);
   4577
   4578    /*
   4579     * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
   4580     * since the Active Namespace ID List should return namespaces with ids
   4581     * *higher* than the NSID specified in the command. This is also specified
   4582     * in the spec (NVM Express v1.3d, Section 5.15.4).
   4583     */
   4584    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
   4585        return NVME_INVALID_NSID | NVME_DNR;
   4586    }
   4587
   4588    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
   4589        ns = nvme_ns(n, i);
   4590        if (!ns) {
   4591            if (!active) {
   4592                ns = nvme_subsys_ns(n->subsys, i);
   4593                if (!ns) {
   4594                    continue;
   4595                }
   4596            } else {
   4597                continue;
   4598            }
   4599        }
   4600        if (ns->params.nsid <= min_nsid) {
   4601            continue;
   4602        }
   4603        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
   4604        if (j == data_len / sizeof(uint32_t)) {
   4605            break;
   4606        }
   4607    }
   4608
   4609    return nvme_c2h(n, list, data_len, req);
   4610}
   4611
   4612static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
   4613                                         bool active)
   4614{
   4615    NvmeNamespace *ns;
   4616    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
   4617    uint32_t min_nsid = le32_to_cpu(c->nsid);
   4618    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
   4619    static const int data_len = sizeof(list);
   4620    uint32_t *list_ptr = (uint32_t *)list;
   4621    int i, j = 0;
   4622
   4623    trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
   4624
   4625    /*
   4626     * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
   4627     */
   4628    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
   4629        return NVME_INVALID_NSID | NVME_DNR;
   4630    }
   4631
   4632    if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
   4633        return NVME_INVALID_FIELD | NVME_DNR;
   4634    }
   4635
   4636    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
   4637        ns = nvme_ns(n, i);
   4638        if (!ns) {
   4639            if (!active) {
   4640                ns = nvme_subsys_ns(n->subsys, i);
   4641                if (!ns) {
   4642                    continue;
   4643                }
   4644            } else {
   4645                continue;
   4646            }
   4647        }
   4648        if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
   4649            continue;
   4650        }
   4651        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
   4652        if (j == data_len / sizeof(uint32_t)) {
   4653            break;
   4654        }
   4655    }
   4656
   4657    return nvme_c2h(n, list, data_len, req);
   4658}
   4659
   4660static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
   4661{
   4662    NvmeNamespace *ns;
   4663    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
   4664    uint32_t nsid = le32_to_cpu(c->nsid);
   4665    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
   4666    uint8_t *pos = list;
   4667    struct {
   4668        NvmeIdNsDescr hdr;
   4669        uint8_t v[NVME_NIDL_UUID];
   4670    } QEMU_PACKED uuid = {};
   4671    struct {
   4672        NvmeIdNsDescr hdr;
   4673        uint64_t v;
   4674    } QEMU_PACKED eui64 = {};
   4675    struct {
   4676        NvmeIdNsDescr hdr;
   4677        uint8_t v;
   4678    } QEMU_PACKED csi = {};
   4679
   4680    trace_pci_nvme_identify_ns_descr_list(nsid);
   4681
   4682    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
   4683        return NVME_INVALID_NSID | NVME_DNR;
   4684    }
   4685
   4686    ns = nvme_ns(n, nsid);
   4687    if (unlikely(!ns)) {
   4688        return NVME_INVALID_FIELD | NVME_DNR;
   4689    }
   4690
   4691    /*
   4692     * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
   4693     * provide a valid Namespace UUID in the Namespace Identification Descriptor
   4694     * data structure. QEMU does not yet support setting NGUID.
   4695     */
   4696    uuid.hdr.nidt = NVME_NIDT_UUID;
   4697    uuid.hdr.nidl = NVME_NIDL_UUID;
   4698    memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
   4699    memcpy(pos, &uuid, sizeof(uuid));
   4700    pos += sizeof(uuid);
   4701
   4702    if (ns->params.eui64) {
   4703        eui64.hdr.nidt = NVME_NIDT_EUI64;
   4704        eui64.hdr.nidl = NVME_NIDL_EUI64;
   4705        eui64.v = cpu_to_be64(ns->params.eui64);
   4706        memcpy(pos, &eui64, sizeof(eui64));
   4707        pos += sizeof(eui64);
   4708    }
   4709
   4710    csi.hdr.nidt = NVME_NIDT_CSI;
   4711    csi.hdr.nidl = NVME_NIDL_CSI;
   4712    csi.v = ns->csi;
   4713    memcpy(pos, &csi, sizeof(csi));
   4714    pos += sizeof(csi);
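           /*
            * The remainder of the 4096-byte buffer stays zero-filled, which
            * terminates the descriptor list.
            */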
   4715
   4716    return nvme_c2h(n, list, sizeof(list), req);
   4717}
   4718
   4719static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
   4720{
   4721    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
   4722    static const int data_len = sizeof(list);
   4723
   4724    trace_pci_nvme_identify_cmd_set();
   4725
   4726    NVME_SET_CSI(*list, NVME_CSI_NVM);
   4727    NVME_SET_CSI(*list, NVME_CSI_ZONED);
   4728
   4729    return nvme_c2h(n, list, data_len, req);
   4730}
   4731
   4732static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
   4733{
   4734    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
   4735
   4736    trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
   4737                            c->csi);
   4738
   4739    switch (c->cns) {
   4740    case NVME_ID_CNS_NS:
   4741        return nvme_identify_ns(n, req, true);
   4742    case NVME_ID_CNS_NS_PRESENT:
   4743        return nvme_identify_ns(n, req, false);
   4744    case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
   4745        return nvme_identify_ctrl_list(n, req, true);
   4746    case NVME_ID_CNS_CTRL_LIST:
   4747        return nvme_identify_ctrl_list(n, req, false);
   4748    case NVME_ID_CNS_CS_NS:
   4749        return nvme_identify_ns_csi(n, req, true);
   4750    case NVME_ID_CNS_CS_NS_PRESENT:
   4751        return nvme_identify_ns_csi(n, req, false);
   4752    case NVME_ID_CNS_CTRL:
   4753        return nvme_identify_ctrl(n, req);
   4754    case NVME_ID_CNS_CS_CTRL:
   4755        return nvme_identify_ctrl_csi(n, req);
   4756    case NVME_ID_CNS_NS_ACTIVE_LIST:
   4757        return nvme_identify_nslist(n, req, true);
   4758    case NVME_ID_CNS_NS_PRESENT_LIST:
   4759        return nvme_identify_nslist(n, req, false);
   4760    case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
   4761        return nvme_identify_nslist_csi(n, req, true);
   4762    case NVME_ID_CNS_CS_NS_PRESENT_LIST:
   4763        return nvme_identify_nslist_csi(n, req, false);
   4764    case NVME_ID_CNS_NS_DESCR_LIST:
   4765        return nvme_identify_ns_descr_list(n, req);
   4766    case NVME_ID_CNS_IO_COMMAND_SET:
   4767        return nvme_identify_cmd_set(n, req);
   4768    default:
   4769        trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
   4770        return NVME_INVALID_FIELD | NVME_DNR;
   4771    }
   4772}
   4773
   4774static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
   4775{
   4776    uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
   4777
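           /*
            * Completion queue entry Dword 0 bit 0 set to 1 indicates that the
            * command was not aborted; this model only validates the submission
            * queue id and never actually aborts anything.
            */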
   4778    req->cqe.result = 1;
   4779    if (nvme_check_sqid(n, sqid)) {
   4780        return NVME_INVALID_FIELD | NVME_DNR;
   4781    }
   4782
   4783    return NVME_SUCCESS;
   4784}
   4785
   4786static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
   4787{
   4788    trace_pci_nvme_setfeat_timestamp(ts);
   4789
   4790    n->host_timestamp = le64_to_cpu(ts);
   4791    n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
   4792}
   4793
   4794static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
   4795{
   4796    uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
   4797    uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
   4798
   4799    union nvme_timestamp {
   4800        struct {
   4801            uint64_t timestamp:48;
   4802            uint64_t sync:1;
   4803            uint64_t origin:3;
   4804            uint64_t rsvd1:12;
   4805        };
   4806        uint64_t all;
   4807    };
   4808
   4809    union nvme_timestamp ts;
   4810    ts.all = 0;
   4811    ts.timestamp = n->host_timestamp + elapsed_time;
   4812
   4813    /* If the host timestamp is non-zero, set the timestamp origin */
   4814    ts.origin = n->host_timestamp ? 0x01 : 0x00;
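           /*
            * Origin 01b means the timestamp was set by the host via Set Features;
            * 00b means it still counts from the last reset.
            */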
   4815
   4816    trace_pci_nvme_getfeat_timestamp(ts.all);
   4817
   4818    return cpu_to_le64(ts.all);
   4819}
   4820
   4821static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
   4822{
   4823    uint64_t timestamp = nvme_get_timestamp(n);
   4824
   4825    return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
   4826}
   4827
   4828static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
   4829{
   4830    NvmeCmd *cmd = &req->cmd;
   4831    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
   4832    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
   4833    uint32_t nsid = le32_to_cpu(cmd->nsid);
   4834    uint32_t result;
   4835    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
   4836    NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
   4837    uint16_t iv;
   4838    NvmeNamespace *ns;
   4839    int i;
   4840
   4841    static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
   4842        [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
   4843    };
   4844
   4845    trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
   4846
   4847    if (!nvme_feature_support[fid]) {
   4848        return NVME_INVALID_FIELD | NVME_DNR;
   4849    }
   4850
   4851    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
   4852        if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
   4853            /*
   4854             * The Reservation Notification Mask and Reservation Persistence
   4855             * features require a status code of Invalid Field in Command when
   4856             * NSID is FFFFFFFFh. Since the device does not support those
   4857             * features we can always return Invalid Namespace or Format as we
   4858             * should do for all other features.
   4859             */
   4860            return NVME_INVALID_NSID | NVME_DNR;
   4861        }
   4862
   4863        if (!nvme_ns(n, nsid)) {
   4864            return NVME_INVALID_FIELD | NVME_DNR;
   4865        }
   4866    }
   4867
   4868    switch (sel) {
   4869    case NVME_GETFEAT_SELECT_CURRENT:
   4870        break;
   4871    case NVME_GETFEAT_SELECT_SAVED:
   4872        /* no features are saveable by the controller; fallthrough */
   4873    case NVME_GETFEAT_SELECT_DEFAULT:
   4874        goto defaults;
   4875    case NVME_GETFEAT_SELECT_CAP:
   4876        result = nvme_feature_cap[fid];
   4877        goto out;
   4878    }
   4879
   4880    switch (fid) {
   4881    case NVME_TEMPERATURE_THRESHOLD:
   4882        result = 0;
   4883
   4884        /*
   4885         * The controller only implements the Composite Temperature sensor, so
   4886         * return 0 for all other sensors.
   4887         */
   4888        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
   4889            goto out;
   4890        }
   4891
   4892        switch (NVME_TEMP_THSEL(dw11)) {
   4893        case NVME_TEMP_THSEL_OVER:
   4894            result = n->features.temp_thresh_hi;
   4895            goto out;
   4896        case NVME_TEMP_THSEL_UNDER:
   4897            result = n->features.temp_thresh_low;
   4898            goto out;
   4899        }
   4900
   4901        return NVME_INVALID_FIELD | NVME_DNR;
   4902    case NVME_ERROR_RECOVERY:
   4903        if (!nvme_nsid_valid(n, nsid)) {
   4904            return NVME_INVALID_NSID | NVME_DNR;
   4905        }
   4906
   4907        ns = nvme_ns(n, nsid);
   4908        if (unlikely(!ns)) {
   4909            return NVME_INVALID_FIELD | NVME_DNR;
   4910        }
   4911
   4912        result = ns->features.err_rec;
   4913        goto out;
   4914    case NVME_VOLATILE_WRITE_CACHE:
   4915        result = 0;
   4916        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
   4917            ns = nvme_ns(n, i);
   4918            if (!ns) {
   4919                continue;
   4920            }
   4921
   4922            result = blk_enable_write_cache(ns->blkconf.blk);
   4923            if (result) {
   4924                break;
   4925            }
   4926        }
   4927        trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
   4928        goto out;
   4929    case NVME_ASYNCHRONOUS_EVENT_CONF:
   4930        result = n->features.async_config;
   4931        goto out;
   4932    case NVME_TIMESTAMP:
   4933        return nvme_get_feature_timestamp(n, req);
   4934    default:
   4935        break;
   4936    }
   4937
   4938defaults:
   4939    switch (fid) {
   4940    case NVME_TEMPERATURE_THRESHOLD:
   4941        result = 0;
   4942
   4943        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
   4944            break;
   4945        }
   4946
   4947        if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
   4948            result = NVME_TEMPERATURE_WARNING;
   4949        }
   4950
   4951        break;
   4952    case NVME_NUMBER_OF_QUEUES:
   4953        result = (n->params.max_ioqpairs - 1) |
   4954            ((n->params.max_ioqpairs - 1) << 16);
   4955        trace_pci_nvme_getfeat_numq(result);
   4956        break;
   4957    case NVME_INTERRUPT_VECTOR_CONF:
   4958        iv = dw11 & 0xffff;
   4959        if (iv >= n->params.max_ioqpairs + 1) {
   4960            return NVME_INVALID_FIELD | NVME_DNR;
   4961        }
   4962
   4963        result = iv;
   4964        if (iv == n->admin_cq.vector) {
   4965            result |= NVME_INTVC_NOCOALESCING;
   4966        }
   4967        break;
   4968    default:
   4969        result = nvme_feature_default[fid];
   4970        break;
   4971    }
   4972
   4973out:
   4974    req->cqe.result = cpu_to_le32(result);
   4975    return NVME_SUCCESS;
   4976}
   4977
   4978static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
   4979{
   4980    uint16_t ret;
   4981    uint64_t timestamp;
   4982
   4983    ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
   4984    if (ret) {
   4985        return ret;
   4986    }
   4987
   4988    nvme_set_timestamp(n, timestamp);
   4989
   4990    return NVME_SUCCESS;
   4991}
   4992
   4993static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
   4994{
   4995    NvmeNamespace *ns = NULL;
   4996
   4997    NvmeCmd *cmd = &req->cmd;
   4998    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
   4999    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
   5000    uint32_t nsid = le32_to_cpu(cmd->nsid);
   5001    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
   5002    uint8_t save = NVME_SETFEAT_SAVE(dw10);
   5003    int i;
   5004
   5005    trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
   5006
   5007    if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
   5008        return NVME_FID_NOT_SAVEABLE | NVME_DNR;
   5009    }
   5010
   5011    if (!nvme_feature_support[fid]) {
   5012        return NVME_INVALID_FIELD | NVME_DNR;
   5013    }
   5014
   5015    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
   5016        if (nsid != NVME_NSID_BROADCAST) {
   5017            if (!nvme_nsid_valid(n, nsid)) {
   5018                return NVME_INVALID_NSID | NVME_DNR;
   5019            }
   5020
   5021            ns = nvme_ns(n, nsid);
   5022            if (unlikely(!ns)) {
   5023                return NVME_INVALID_FIELD | NVME_DNR;
   5024            }
   5025        }
   5026    } else if (nsid && nsid != NVME_NSID_BROADCAST) {
   5027        if (!nvme_nsid_valid(n, nsid)) {
   5028            return NVME_INVALID_NSID | NVME_DNR;
   5029        }
   5030
   5031        return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
   5032    }
   5033
   5034    if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
   5035        return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
   5036    }
   5037
   5038    switch (fid) {
   5039    case NVME_TEMPERATURE_THRESHOLD:
   5040        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
   5041            break;
   5042        }
   5043
   5044        switch (NVME_TEMP_THSEL(dw11)) {
   5045        case NVME_TEMP_THSEL_OVER:
   5046            n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
   5047            break;
   5048        case NVME_TEMP_THSEL_UNDER:
   5049            n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
   5050            break;
   5051        default:
   5052            return NVME_INVALID_FIELD | NVME_DNR;
   5053        }
   5054
   5055        if ((n->temperature >= n->features.temp_thresh_hi) ||
   5056            (n->temperature <= n->features.temp_thresh_low)) {
   5057            nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
   5058        }
   5059
   5060        break;
   5061    case NVME_ERROR_RECOVERY:
   5062        if (nsid == NVME_NSID_BROADCAST) {
   5063            for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
   5064                ns = nvme_ns(n, i);
   5065
   5066                if (!ns) {
   5067                    continue;
   5068                }
   5069
   5070                if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
   5071                    ns->features.err_rec = dw11;
   5072                }
   5073            }
   5074
   5075            break;
   5076        }
   5077
   5078        assert(ns);
   5079        if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
   5080            ns->features.err_rec = dw11;
   5081        }
   5082        break;
   5083    case NVME_VOLATILE_WRITE_CACHE:
   5084        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
   5085            ns = nvme_ns(n, i);
   5086            if (!ns) {
   5087                continue;
   5088            }
   5089
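                    /*
                     * If the cache is being disabled, flush it first so that
                     * previously cached writes reach stable storage.
                     */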
   5090            if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
   5091                blk_flush(ns->blkconf.blk);
   5092            }
   5093
   5094            blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
   5095        }
   5096
   5097        break;
   5098
   5099    case NVME_NUMBER_OF_QUEUES:
   5100        if (n->qs_created) {
   5101            return NVME_CMD_SEQ_ERROR | NVME_DNR;
   5102        }
   5103
   5104        /*
   5105         * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
   5106         * and NSQR.
   5107         */
   5108        if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
   5109            return NVME_INVALID_FIELD | NVME_DNR;
   5110        }
   5111
   5112        trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
   5113                                    ((dw11 >> 16) & 0xffff) + 1,
   5114                                    n->params.max_ioqpairs,
   5115                                    n->params.max_ioqpairs);
   5116        req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
   5117                                      ((n->params.max_ioqpairs - 1) << 16));
   5118        break;
   5119    case NVME_ASYNCHRONOUS_EVENT_CONF:
   5120        n->features.async_config = dw11;
   5121        break;
   5122    case NVME_TIMESTAMP:
   5123        return nvme_set_feature_timestamp(n, req);
   5124    case NVME_COMMAND_SET_PROFILE:
   5125        if (dw11 & 0x1ff) {
   5126            trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
   5127            return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
   5128        }
   5129        break;
   5130    default:
   5131        return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
   5132    }
   5133    return NVME_SUCCESS;
   5134}
   5135
   5136static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
   5137{
   5138    trace_pci_nvme_aer(nvme_cid(req));
   5139
   5140    if (n->outstanding_aers > n->params.aerl) {
   5141        trace_pci_nvme_aer_aerl_exceeded();
   5142        return NVME_AER_LIMIT_EXCEEDED;
   5143    }
   5144
   5145    n->aer_reqs[n->outstanding_aers] = req;
   5146    n->outstanding_aers++;
   5147
   5148    if (!QTAILQ_EMPTY(&n->aer_queue)) {
   5149        nvme_process_aers(n);
   5150    }
   5151
   5152    return NVME_NO_COMPLETE;
   5153}
   5154
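        /*
         * Recompute DMRSL (Dataset Management Range Size Limit) as the
         * smallest per-namespace limit: the block layer caps a single request
         * at BDRV_REQUEST_MAX_BYTES, expressed here in logical blocks of each
         * attached namespace.
         */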
   5155static void nvme_update_dmrsl(NvmeCtrl *n)
   5156{
   5157    int nsid;
   5158
   5159    for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
   5160        NvmeNamespace *ns = nvme_ns(n, nsid);
   5161        if (!ns) {
   5162            continue;
   5163        }
   5164
   5165        n->dmrsl = MIN_NON_ZERO(n->dmrsl,
   5166                                BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
   5167    }
   5168}
   5169
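        /*
         * Pick the I/O command set a namespace exposes based on CC.CSS: NVM
         * namespaces get the NVM command set unless the controller is
         * admin-only; zoned namespaces need CSS set to "all supported command
         * sets" (CSI) and are otherwise exposed as plain NVM namespaces when
         * CSS selects the NVM command set.
         */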
   5170static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
   5171{
   5172    uint32_t cc = ldl_le_p(&n->bar.cc);
   5173
   5174    ns->iocs = nvme_cse_iocs_none;
   5175    switch (ns->csi) {
   5176    case NVME_CSI_NVM:
   5177        if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
   5178            ns->iocs = nvme_cse_iocs_nvm;
   5179        }
   5180        break;
   5181    case NVME_CSI_ZONED:
   5182        if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
   5183            ns->iocs = nvme_cse_iocs_zoned;
   5184        } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
   5185            ns->iocs = nvme_cse_iocs_nvm;
   5186        }
   5187        break;
   5188    }
   5189}
   5190
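        /*
         * The host supplies a Controller List: a 4 KiB buffer whose first
         * 16-bit entry is the number of controller identifiers that follow.
         */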
   5191static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
   5192{
   5193    NvmeNamespace *ns;
   5194    NvmeCtrl *ctrl;
   5195    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
   5196    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
   5197    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
   5198    uint8_t sel = dw10 & 0xf;
   5199    uint16_t *nr_ids = &list[0];
   5200    uint16_t *ids = &list[1];
   5201    uint16_t ret;
   5202    int i;
   5203
   5204    trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
   5205
   5206    if (!nvme_nsid_valid(n, nsid)) {
   5207        return NVME_INVALID_NSID | NVME_DNR;
   5208    }
   5209
   5210    ns = nvme_subsys_ns(n->subsys, nsid);
   5211    if (!ns) {
   5212        return NVME_INVALID_FIELD | NVME_DNR;
   5213    }
   5214
   5215    ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
   5216    if (ret) {
   5217        return ret;
   5218    }
   5219
   5220    if (!*nr_ids) {
   5221        return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
   5222    }
   5223
   5224    *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
   5225    for (i = 0; i < *nr_ids; i++) {
   5226        ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
   5227        if (!ctrl) {
   5228            return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
   5229        }
   5230
   5231        switch (sel) {
   5232        case NVME_NS_ATTACHMENT_ATTACH:
   5233            if (nvme_ns(ctrl, nsid)) {
   5234                return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
   5235            }
   5236
   5237            if (ns->attached && !ns->params.shared) {
   5238                return NVME_NS_PRIVATE | NVME_DNR;
   5239            }
   5240
   5241            nvme_attach_ns(ctrl, ns);
   5242            nvme_select_iocs_ns(ctrl, ns);
   5243
   5244            break;
   5245
   5246        case NVME_NS_ATTACHMENT_DETACH:
   5247            if (!nvme_ns(ctrl, nsid)) {
   5248                return NVME_NS_NOT_ATTACHED | NVME_DNR;
   5249            }
   5250
   5251            ctrl->namespaces[nsid] = NULL;
   5252            ns->attached--;
   5253
   5254            nvme_update_dmrsl(ctrl);
   5255
   5256            break;
   5257
   5258        default:
   5259            return NVME_INVALID_FIELD | NVME_DNR;
   5260        }
   5261
   5262        /*
   5263         * Add namespace id to the changed namespace id list for event clearing
   5264         * via Get Log Page command.
   5265         */
   5266        if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
   5267            nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
   5268                               NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
   5269                               NVME_LOG_CHANGED_NSLIST);
   5270        }
   5271    }
   5272
   5273    return NVME_SUCCESS;
   5274}
   5275
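        /*
         * Format NVM is driven by a bottom half: nvme_format_bh() picks the
         * (next) namespace and checks the requested LBA format, while
         * nvme_format_ns_cb() zeroes the namespace in
         * BDRV_REQUEST_MAX_BYTES-sized chunks before applying the new format.
         */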
   5276typedef struct NvmeFormatAIOCB {
   5277    BlockAIOCB common;
   5278    BlockAIOCB *aiocb;
   5279    QEMUBH *bh;
   5280    NvmeRequest *req;
   5281    int ret;
   5282
   5283    NvmeNamespace *ns;
   5284    uint32_t nsid;
   5285    bool broadcast;
   5286    int64_t offset;
   5287} NvmeFormatAIOCB;
   5288
   5289static void nvme_format_bh(void *opaque);
   5290
   5291static void nvme_format_cancel(BlockAIOCB *aiocb)
   5292{
   5293    NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
   5294
   5295    if (iocb->aiocb) {
   5296        blk_aio_cancel_async(iocb->aiocb);
   5297    }
   5298}
   5299
   5300static const AIOCBInfo nvme_format_aiocb_info = {
   5301    .aiocb_size = sizeof(NvmeFormatAIOCB),
   5302    .cancel_async = nvme_format_cancel,
   5303    .get_aio_context = nvme_get_aio_context,
   5304};
   5305
   5306static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
   5307{
   5308    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
   5309    uint8_t lbaf = dw10 & 0xf;
   5310    uint8_t pi = (dw10 >> 5) & 0x7;
   5311    uint8_t mset = (dw10 >> 4) & 0x1;
   5312    uint8_t pil = (dw10 >> 8) & 0x1;
   5313
   5314    trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
   5315
   5316    ns->id_ns.dps = (pil << 3) | pi;
   5317    ns->id_ns.flbas = lbaf | (mset << 4);
   5318
   5319    nvme_ns_init_format(ns);
   5320}
   5321
   5322static void nvme_format_ns_cb(void *opaque, int ret)
   5323{
   5324    NvmeFormatAIOCB *iocb = opaque;
   5325    NvmeRequest *req = iocb->req;
   5326    NvmeNamespace *ns = iocb->ns;
   5327    int bytes;
   5328
   5329    if (ret < 0) {
   5330        iocb->ret = ret;
   5331        goto done;
   5332    }
   5333
   5334    assert(ns);
   5335
   5336    if (iocb->offset < ns->size) {
   5337        bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
   5338
   5339        iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
   5340                                            bytes, BDRV_REQ_MAY_UNMAP,
   5341                                            nvme_format_ns_cb, iocb);
   5342
   5343        iocb->offset += bytes;
   5344        return;
   5345    }
   5346
   5347    nvme_format_set(ns, &req->cmd);
   5348    ns->status = 0x0;
   5349    iocb->ns = NULL;
   5350    iocb->offset = 0;
   5351
   5352done:
   5353    iocb->aiocb = NULL;
   5354    qemu_bh_schedule(iocb->bh);
   5355}
   5356
   5357static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
   5358{
   5359    if (ns->params.zoned) {
   5360        return NVME_INVALID_FORMAT | NVME_DNR;
   5361    }
   5362
   5363    if (lbaf > ns->id_ns.nlbaf) {
   5364        return NVME_INVALID_FORMAT | NVME_DNR;
   5365    }
   5366
   5367    if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
   5368        return NVME_INVALID_FORMAT | NVME_DNR;
   5369    }
   5370
   5371    if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
   5372        return NVME_INVALID_FIELD | NVME_DNR;
   5373    }
   5374
   5375    return NVME_SUCCESS;
   5376}
   5377
   5378static void nvme_format_bh(void *opaque)
   5379{
   5380    NvmeFormatAIOCB *iocb = opaque;
   5381    NvmeRequest *req = iocb->req;
   5382    NvmeCtrl *n = nvme_ctrl(req);
   5383    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
   5384    uint8_t lbaf = dw10 & 0xf;
   5385    uint8_t pi = (dw10 >> 5) & 0x7;
   5386    uint16_t status;
   5387    int i;
   5388
   5389    if (iocb->ret < 0) {
   5390        goto done;
   5391    }
   5392
   5393    if (iocb->broadcast) {
   5394        for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
   5395            iocb->ns = nvme_ns(n, i);
   5396            if (iocb->ns) {
   5397                iocb->nsid = i;
   5398                break;
   5399            }
   5400        }
   5401    }
   5402
   5403    if (!iocb->ns) {
   5404        goto done;
   5405    }
   5406
   5407    status = nvme_format_check(iocb->ns, lbaf, pi);
   5408    if (status) {
   5409        req->status = status;
   5410        goto done;
   5411    }
   5412
   5413    iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
   5414    nvme_format_ns_cb(iocb, 0);
   5415    return;
   5416
   5417done:
   5418    qemu_bh_delete(iocb->bh);
   5419    iocb->bh = NULL;
   5420
   5421    iocb->common.cb(iocb->common.opaque, iocb->ret);
   5422
   5423    qemu_aio_unref(iocb);
   5424}
   5425
   5426static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
   5427{
   5428    NvmeFormatAIOCB *iocb;
   5429    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
   5430    uint16_t status;
   5431
   5432    iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
   5433
   5434    iocb->req = req;
   5435    iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
   5436    iocb->ret = 0;
   5437    iocb->ns = NULL;
   5438    iocb->nsid = 0;
   5439    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
   5440    iocb->offset = 0;
   5441
   5442    if (!iocb->broadcast) {
   5443        if (!nvme_nsid_valid(n, nsid)) {
   5444            status = NVME_INVALID_NSID | NVME_DNR;
   5445            goto out;
   5446        }
   5447
   5448        iocb->ns = nvme_ns(n, nsid);
   5449        if (!iocb->ns) {
   5450            status = NVME_INVALID_FIELD | NVME_DNR;
   5451            goto out;
   5452        }
   5453    }
   5454
   5455    req->aiocb = &iocb->common;
   5456    qemu_bh_schedule(iocb->bh);
   5457
   5458    return NVME_NO_COMPLETE;
   5459
   5460out:
   5461    qemu_bh_delete(iocb->bh);
   5462    iocb->bh = NULL;
   5463    qemu_aio_unref(iocb);
   5464    return status;
   5465}
   5466
   5467static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
   5468{
   5469    trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
   5470                             nvme_adm_opc_str(req->cmd.opcode));
   5471
   5472    if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
   5473        trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
   5474        return NVME_INVALID_OPCODE | NVME_DNR;
   5475    }
   5476
   5477    /* SGLs shall not be used for Admin commands in NVMe over PCIe */
   5478    if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
   5479        return NVME_INVALID_FIELD | NVME_DNR;
   5480    }
   5481
   5482    if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
   5483        return NVME_INVALID_FIELD;
   5484    }
   5485
   5486    switch (req->cmd.opcode) {
   5487    case NVME_ADM_CMD_DELETE_SQ:
   5488        return nvme_del_sq(n, req);
   5489    case NVME_ADM_CMD_CREATE_SQ:
   5490        return nvme_create_sq(n, req);
   5491    case NVME_ADM_CMD_GET_LOG_PAGE:
   5492        return nvme_get_log(n, req);
   5493    case NVME_ADM_CMD_DELETE_CQ:
   5494        return nvme_del_cq(n, req);
   5495    case NVME_ADM_CMD_CREATE_CQ:
   5496        return nvme_create_cq(n, req);
   5497    case NVME_ADM_CMD_IDENTIFY:
   5498        return nvme_identify(n, req);
   5499    case NVME_ADM_CMD_ABORT:
   5500        return nvme_abort(n, req);
   5501    case NVME_ADM_CMD_SET_FEATURES:
   5502        return nvme_set_feature(n, req);
   5503    case NVME_ADM_CMD_GET_FEATURES:
   5504        return nvme_get_feature(n, req);
   5505    case NVME_ADM_CMD_ASYNC_EV_REQ:
   5506        return nvme_aer(n, req);
   5507    case NVME_ADM_CMD_NS_ATTACHMENT:
   5508        return nvme_ns_attachment(n, req);
   5509    case NVME_ADM_CMD_FORMAT_NVM:
   5510        return nvme_format(n, req);
   5511    default:
   5512        assert(false);
   5513    }
   5514
   5515    return NVME_INVALID_OPCODE | NVME_DNR;
   5516}
   5517
   5518static void nvme_process_sq(void *opaque)
   5519{
   5520    NvmeSQueue *sq = opaque;
   5521    NvmeCtrl *n = sq->ctrl;
   5522    NvmeCQueue *cq = n->cq[sq->cqid];
   5523
   5524    uint16_t status;
   5525    hwaddr addr;
   5526    NvmeCmd cmd;
   5527    NvmeRequest *req;
   5528
   5529    while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
   5530        addr = sq->dma_addr + sq->head * n->sqe_size;
   5531        if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
   5532            trace_pci_nvme_err_addr_read(addr);
   5533            trace_pci_nvme_err_cfs();
   5534            stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
   5535            break;
   5536        }
   5537        nvme_inc_sq_head(sq);
   5538
   5539        req = QTAILQ_FIRST(&sq->req_list);
   5540        QTAILQ_REMOVE(&sq->req_list, req, entry);
   5541        QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
   5542        nvme_req_clear(req);
   5543        req->cqe.cid = cmd.cid;
   5544        memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
   5545
   5546        status = sq->sqid ? nvme_io_cmd(n, req) :
   5547            nvme_admin_cmd(n, req);
   5548        if (status != NVME_NO_COMPLETE) {
   5549            req->status = status;
   5550            nvme_enqueue_req_completion(cq, req);
   5551        }
   5552    }
   5553}
   5554
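        /*
         * Controller Reset (CC.EN 1 -> 0): drain outstanding namespace I/O,
         * free every submission and completion queue and drop any queued
         * asynchronous events.  Shutdown processing (CC.SHN) is handled
         * separately by nvme_ctrl_shutdown().
         */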
   5555static void nvme_ctrl_reset(NvmeCtrl *n)
   5556{
   5557    NvmeNamespace *ns;
   5558    int i;
   5559
   5560    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
   5561        ns = nvme_ns(n, i);
   5562        if (!ns) {
   5563            continue;
   5564        }
   5565
   5566        nvme_ns_drain(ns);
   5567    }
   5568
   5569    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
   5570        if (n->sq[i] != NULL) {
   5571            nvme_free_sq(n->sq[i], n);
   5572        }
   5573    }
   5574    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
   5575        if (n->cq[i] != NULL) {
   5576            nvme_free_cq(n->cq[i], n);
   5577        }
   5578    }
   5579
   5580    while (!QTAILQ_EMPTY(&n->aer_queue)) {
   5581        NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
   5582        QTAILQ_REMOVE(&n->aer_queue, event, entry);
   5583        g_free(event);
   5584    }
   5585
   5586    n->aer_queued = 0;
   5587    n->outstanding_aers = 0;
   5588    n->qs_created = false;
   5589}
   5590
   5591static void nvme_ctrl_shutdown(NvmeCtrl *n)
   5592{
   5593    NvmeNamespace *ns;
   5594    int i;
   5595
   5596    if (n->pmr.dev) {
   5597        memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
   5598    }
   5599
   5600    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
   5601        ns = nvme_ns(n, i);
   5602        if (!ns) {
   5603            continue;
   5604        }
   5605
   5606        nvme_ns_shutdown(ns);
   5607    }
   5608}
   5609
   5610static void nvme_select_iocs(NvmeCtrl *n)
   5611{
   5612    NvmeNamespace *ns;
   5613    int i;
   5614
   5615    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
   5616        ns = nvme_ns(n, i);
   5617        if (!ns) {
   5618            continue;
   5619        }
   5620
   5621        nvme_select_iocs_ns(n, ns);
   5622    }
   5623}
   5624
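        /*
         * Called when the host enables the controller (CC.EN 0 -> 1).  The
         * admin queue addresses/attributes and the CC fields are validated
         * against CAP (selected command set, memory page size within
         * MPSMIN/MPSMAX, queue entry sizes) before the admin queue pair is
         * brought up.  The memory page size is 2^(12 + CC.MPS), i.e. at least
         * 4 KiB.
         */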
   5625static int nvme_start_ctrl(NvmeCtrl *n)
   5626{
   5627    uint64_t cap = ldq_le_p(&n->bar.cap);
   5628    uint32_t cc = ldl_le_p(&n->bar.cc);
   5629    uint32_t aqa = ldl_le_p(&n->bar.aqa);
   5630    uint64_t asq = ldq_le_p(&n->bar.asq);
   5631    uint64_t acq = ldq_le_p(&n->bar.acq);
   5632    uint32_t page_bits = NVME_CC_MPS(cc) + 12;
   5633    uint32_t page_size = 1 << page_bits;
   5634
   5635    if (unlikely(n->cq[0])) {
   5636        trace_pci_nvme_err_startfail_cq();
   5637        return -1;
   5638    }
   5639    if (unlikely(n->sq[0])) {
   5640        trace_pci_nvme_err_startfail_sq();
   5641        return -1;
   5642    }
   5643    if (unlikely(asq & (page_size - 1))) {
   5644        trace_pci_nvme_err_startfail_asq_misaligned(asq);
   5645        return -1;
   5646    }
   5647    if (unlikely(acq & (page_size - 1))) {
   5648        trace_pci_nvme_err_startfail_acq_misaligned(acq);
   5649        return -1;
   5650    }
   5651    if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
   5652        trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
   5653        return -1;
   5654    }
   5655    if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
   5656        trace_pci_nvme_err_startfail_page_too_small(
   5657                    NVME_CC_MPS(cc),
   5658                    NVME_CAP_MPSMIN(cap));
   5659        return -1;
   5660    }
   5661    if (unlikely(NVME_CC_MPS(cc) >
   5662                 NVME_CAP_MPSMAX(cap))) {
   5663        trace_pci_nvme_err_startfail_page_too_large(
   5664                    NVME_CC_MPS(cc),
   5665                    NVME_CAP_MPSMAX(cap));
   5666        return -1;
   5667    }
   5668    if (unlikely(NVME_CC_IOCQES(cc) <
   5669                 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
   5670        trace_pci_nvme_err_startfail_cqent_too_small(
   5671                    NVME_CC_IOCQES(cc),
   5672                    NVME_CTRL_CQES_MIN(cap));
   5673        return -1;
   5674    }
   5675    if (unlikely(NVME_CC_IOCQES(cc) >
   5676                 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
   5677        trace_pci_nvme_err_startfail_cqent_too_large(
   5678                    NVME_CC_IOCQES(cc),
   5679                    NVME_CTRL_CQES_MAX(cap));
   5680        return -1;
   5681    }
   5682    if (unlikely(NVME_CC_IOSQES(cc) <
   5683                 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
   5684        trace_pci_nvme_err_startfail_sqent_too_small(
   5685                    NVME_CC_IOSQES(cc),
   5686                    NVME_CTRL_SQES_MIN(cap));
   5687        return -1;
   5688    }
   5689    if (unlikely(NVME_CC_IOSQES(cc) >
   5690                 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
   5691        trace_pci_nvme_err_startfail_sqent_too_large(
   5692                    NVME_CC_IOSQES(cc),
   5693                    NVME_CTRL_SQES_MAX(cap));
   5694        return -1;
   5695    }
   5696    if (unlikely(!NVME_AQA_ASQS(aqa))) {
   5697        trace_pci_nvme_err_startfail_asqent_sz_zero();
   5698        return -1;
   5699    }
   5700    if (unlikely(!NVME_AQA_ACQS(aqa))) {
   5701        trace_pci_nvme_err_startfail_acqent_sz_zero();
   5702        return -1;
   5703    }
   5704
   5705    n->page_bits = page_bits;
   5706    n->page_size = page_size;
   5707    n->max_prp_ents = n->page_size / sizeof(uint64_t);
   5708    n->cqe_size = 1 << NVME_CC_IOCQES(cc);
   5709    n->sqe_size = 1 << NVME_CC_IOSQES(cc);
   5710    nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
   5711    nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
   5712
   5713    nvme_set_timestamp(n, 0ULL);
   5714
   5715    QTAILQ_INIT(&n->aer_queue);
   5716
   5717    nvme_select_iocs(n);
   5718
   5719    return 0;
   5720}
   5721
   5722static void nvme_cmb_enable_regs(NvmeCtrl *n)
   5723{
   5724    uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
   5725    uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
   5726
   5727    NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
   5728    NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
   5729    NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
   5730    stl_le_p(&n->bar.cmbloc, cmbloc);
   5731
   5732    NVME_CMBSZ_SET_SQS(cmbsz, 1);
   5733    NVME_CMBSZ_SET_CQS(cmbsz, 0);
   5734    NVME_CMBSZ_SET_LISTS(cmbsz, 1);
   5735    NVME_CMBSZ_SET_RDS(cmbsz, 1);
   5736    NVME_CMBSZ_SET_WDS(cmbsz, 1);
   5737    NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
   5738    NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
   5739    stl_le_p(&n->bar.cmbsz, cmbsz);
   5740}
   5741
   5742static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
   5743                           unsigned size)
   5744{
   5745    uint64_t cap = ldq_le_p(&n->bar.cap);
   5746    uint32_t cc = ldl_le_p(&n->bar.cc);
   5747    uint32_t intms = ldl_le_p(&n->bar.intms);
   5748    uint32_t csts = ldl_le_p(&n->bar.csts);
   5749    uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
   5750
   5751    if (unlikely(offset & (sizeof(uint32_t) - 1))) {
   5752        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
   5753                       "MMIO write not 32-bit aligned,"
   5754                       " offset=0x%"PRIx64"", offset);
   5755        /* should be ignored, fall through for now */
   5756    }
   5757
   5758    if (unlikely(size < sizeof(uint32_t))) {
   5759        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
   5760                       "MMIO write smaller than 32-bits,"
   5761                       " offset=0x%"PRIx64", size=%u",
   5762                       offset, size);
   5763        /* should be ignored, fall through for now */
   5764    }
   5765
   5766    switch (offset) {
   5767    case NVME_REG_INTMS:
   5768        if (unlikely(msix_enabled(&(n->parent_obj)))) {
   5769            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
   5770                           "undefined access to interrupt mask set"
   5771                           " when MSI-X is enabled");
   5772            /* should be ignored, fall through for now */
   5773        }
   5774        intms |= data;
   5775        stl_le_p(&n->bar.intms, intms);
   5776        n->bar.intmc = n->bar.intms;
   5777        trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
   5778        nvme_irq_check(n);
   5779        break;
   5780    case NVME_REG_INTMC:
   5781        if (unlikely(msix_enabled(&(n->parent_obj)))) {
   5782            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
   5783                           "undefined access to interrupt mask clr"
   5784                           " when MSI-X is enabled");
   5785            /* should be ignored, fall through for now */
   5786        }
   5787        intms &= ~data;
   5788        stl_le_p(&n->bar.intms, intms);
   5789        n->bar.intmc = n->bar.intms;
   5790        trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
   5791        nvme_irq_check(n);
   5792        break;
   5793    case NVME_REG_CC:
   5794        trace_pci_nvme_mmio_cfg(data & 0xffffffff);
   5795
   5796        /* Windows first sends data, then sends enable bit */
   5797        if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
   5798            !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))
   5799        {
   5800            cc = data;
   5801        }
   5802
   5803        if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
   5804            cc = data;
   5805
   5806            /* flush CC since nvme_start_ctrl() needs the value */
   5807            stl_le_p(&n->bar.cc, cc);
   5808            if (unlikely(nvme_start_ctrl(n))) {
   5809                trace_pci_nvme_err_startfail();
   5810                csts = NVME_CSTS_FAILED;
   5811            } else {
   5812                trace_pci_nvme_mmio_start_success();
   5813                csts = NVME_CSTS_READY;
   5814            }
   5815        } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
   5816            trace_pci_nvme_mmio_stopped();
   5817            nvme_ctrl_reset(n);
   5818            cc = 0;
   5819            csts &= ~NVME_CSTS_READY;
   5820        }
   5821
   5822        if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
   5823            trace_pci_nvme_mmio_shutdown_set();
   5824            nvme_ctrl_shutdown(n);
   5825            cc = data;
   5826            csts |= NVME_CSTS_SHST_COMPLETE;
   5827        } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
   5828            trace_pci_nvme_mmio_shutdown_cleared();
   5829            csts &= ~NVME_CSTS_SHST_COMPLETE;
   5830            cc = data;
   5831        }
   5832
   5833        stl_le_p(&n->bar.cc, cc);
   5834        stl_le_p(&n->bar.csts, csts);
   5835
   5836        break;
   5837    case NVME_REG_CSTS:
   5838        if (data & (1 << 4)) {
   5839            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
   5840                           "attempted to W1C CSTS.NSSRO"
   5841                           " but CAP.NSSRS is zero (not supported)");
   5842        } else if (data != 0) {
   5843            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
   5844                           "attempted to set a read only bit"
   5845                           " of controller status");
   5846        }
   5847        break;
   5848    case NVME_REG_NSSR:
   5849        if (data == 0x4e564d65) {
   5850            trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
   5851        } else {
   5852            /* The spec says that writes of other values have no effect */
   5853            return;
   5854        }
   5855        break;
   5856    case NVME_REG_AQA:
   5857        stl_le_p(&n->bar.aqa, data);
   5858        trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
   5859        break;
   5860    case NVME_REG_ASQ:
   5861        stn_le_p(&n->bar.asq, size, data);
   5862        trace_pci_nvme_mmio_asqaddr(data);
   5863        break;
   5864    case NVME_REG_ASQ + 4:
   5865        stl_le_p((uint8_t *)&n->bar.asq + 4, data);
   5866        trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
   5867        break;
   5868    case NVME_REG_ACQ:
   5869        trace_pci_nvme_mmio_acqaddr(data);
   5870        stn_le_p(&n->bar.acq, size, data);
   5871        break;
   5872    case NVME_REG_ACQ + 4:
   5873        stl_le_p((uint8_t *)&n->bar.acq + 4, data);
   5874        trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
   5875        break;
   5876    case NVME_REG_CMBLOC:
   5877        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
   5878                       "invalid write to reserved CMBLOC"
   5879                       " when CMBSZ is zero, ignored");
   5880        return;
   5881    case NVME_REG_CMBSZ:
   5882        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
   5883                       "invalid write to read only CMBSZ, ignored");
   5884        return;
   5885    case NVME_REG_CMBMSC:
   5886        if (!NVME_CAP_CMBS(cap)) {
   5887            return;
   5888        }
   5889
   5890        stn_le_p(&n->bar.cmbmsc, size, data);
   5891        n->cmb.cmse = false;
   5892
   5893        if (NVME_CMBMSC_CRE(data)) {
   5894            nvme_cmb_enable_regs(n);
   5895
   5896            if (NVME_CMBMSC_CMSE(data)) {
   5897                uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
   5898                hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
   5899                if (cba + int128_get64(n->cmb.mem.size) < cba) {
   5900                    uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
   5901                    NVME_CMBSTS_SET_CBAI(cmbsts, 1);
   5902                    stl_le_p(&n->bar.cmbsts, cmbsts);
   5903                    return;
   5904                }
   5905
   5906                n->cmb.cba = cba;
   5907                n->cmb.cmse = true;
   5908            }
   5909        } else {
   5910            n->bar.cmbsz = 0;
   5911            n->bar.cmbloc = 0;
   5912        }
   5913
   5914        return;
   5915    case NVME_REG_CMBMSC + 4:
   5916        stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
   5917        return;
   5918
   5919    case NVME_REG_PMRCAP:
   5920        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
   5921                       "invalid write to PMRCAP register, ignored");
   5922        return;
   5923    case NVME_REG_PMRCTL:
   5924        if (!NVME_CAP_PMRS(cap)) {
   5925            return;
   5926        }
   5927
   5928        stl_le_p(&n->bar.pmrctl, data);
   5929        if (NVME_PMRCTL_EN(data)) {
   5930            memory_region_set_enabled(&n->pmr.dev->mr, true);
   5931            pmrsts = 0;
   5932        } else {
   5933            memory_region_set_enabled(&n->pmr.dev->mr, false);
   5934            NVME_PMRSTS_SET_NRDY(pmrsts, 1);
   5935            n->pmr.cmse = false;
   5936        }
   5937        stl_le_p(&n->bar.pmrsts, pmrsts);
   5938        return;
   5939    case NVME_REG_PMRSTS:
   5940        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
   5941                       "invalid write to PMRSTS register, ignored");
   5942        return;
   5943    case NVME_REG_PMREBS:
   5944        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
   5945                       "invalid write to PMREBS register, ignored");
   5946        return;
   5947    case NVME_REG_PMRSWTP:
   5948        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
   5949                       "invalid write to PMRSWTP register, ignored");
   5950        return;
   5951    case NVME_REG_PMRMSCL:
   5952        if (!NVME_CAP_PMRS(cap)) {
   5953            return;
   5954        }
   5955
   5956        stl_le_p(&n->bar.pmrmscl, data);
   5957        n->pmr.cmse = false;
   5958
   5959        if (NVME_PMRMSCL_CMSE(data)) {
   5960            uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
   5961            hwaddr cba = pmrmscu << 32 |
   5962                (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
   5963            if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
   5964                NVME_PMRSTS_SET_CBAI(pmrsts, 1);
   5965                stl_le_p(&n->bar.pmrsts, pmrsts);
   5966                return;
   5967            }
   5968
   5969            n->pmr.cmse = true;
   5970            n->pmr.cba = cba;
   5971        }
   5972
   5973        return;
   5974    case NVME_REG_PMRMSCU:
   5975        if (!NVME_CAP_PMRS(cap)) {
   5976            return;
   5977        }
   5978
   5979        stl_le_p(&n->bar.pmrmscu, data);
   5980        return;
   5981    default:
   5982        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
   5983                       "invalid MMIO write,"
   5984                       " offset=0x%"PRIx64", data=%"PRIx64"",
   5985                       offset, data);
   5986        break;
   5987    }
   5988}
   5989
   5990static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
   5991{
   5992    NvmeCtrl *n = (NvmeCtrl *)opaque;
   5993    uint8_t *ptr = (uint8_t *)&n->bar;
   5994
   5995    trace_pci_nvme_mmio_read(addr, size);
   5996
   5997    if (unlikely(addr & (sizeof(uint32_t) - 1))) {
   5998        NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
   5999                       "MMIO read not 32-bit aligned,"
   6000                       " offset=0x%"PRIx64"", addr);
   6001        /* should RAZ, fall through for now */
   6002    } else if (unlikely(size < sizeof(uint32_t))) {
   6003        NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
   6004                       "MMIO read smaller than 32-bits,"
   6005                       " offset=0x%"PRIx64"", addr);
   6006        /* should RAZ, fall through for now */
   6007    }
   6008
   6009    if (addr > sizeof(n->bar) - size) {
   6010        NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
   6011                       "MMIO read beyond last register,"
   6012                       " offset=0x%"PRIx64", returning 0", addr);
   6013
   6014        return 0;
   6015    }
   6016
   6017    /*
    6018     * When PMRWBM bit 1 is set, a read
    6019     * from PMRSTS should ensure that prior
    6020     * writes have made it to persistent media.
   6021     */
   6022    if (addr == NVME_REG_PMRSTS &&
   6023        (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
   6024        memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
   6025    }
   6026
   6027    return ldn_le_p(ptr + addr, size);
   6028}
   6029
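        /*
         * Doorbell registers start at offset 1000h with a 4 byte stride
         * (CAP.DSTRD = 0): SQyTDBL lives at 1000h + (2 * y) * 4 and CQyHDBL
         * at 1000h + (2 * y + 1) * 4.  For example, 1000h/1004h are the admin
         * queue doorbells and 1008h/100ch belong to I/O queue pair 1.
         */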
   6030static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
   6031{
   6032    uint32_t qid;
   6033
   6034    if (unlikely(addr & ((1 << 2) - 1))) {
   6035        NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
   6036                       "doorbell write not 32-bit aligned,"
   6037                       " offset=0x%"PRIx64", ignoring", addr);
   6038        return;
   6039    }
   6040
   6041    if (((addr - 0x1000) >> 2) & 1) {
   6042        /* Completion queue doorbell write */
   6043
   6044        uint16_t new_head = val & 0xffff;
   6045        int start_sqs;
   6046        NvmeCQueue *cq;
   6047
   6048        qid = (addr - (0x1000 + (1 << 2))) >> 3;
   6049        if (unlikely(nvme_check_cqid(n, qid))) {
   6050            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
   6051                           "completion queue doorbell write"
   6052                           " for nonexistent queue,"
    6053                           " cqid=%"PRIu32", ignoring", qid);
   6054
   6055            /*
    6056             * NVM Express v1.3d, Section 4.1 states: "If host software writes
    6057             * an invalid value to the Submission Queue Tail Doorbell or
    6058             * Completion Queue Head Doorbell register and an Asynchronous Event
   6059             * Request command is outstanding, then an asynchronous event is
   6060             * posted to the Admin Completion Queue with a status code of
   6061             * Invalid Doorbell Write Value."
   6062             *
   6063             * Also note that the spec includes the "Invalid Doorbell Register"
   6064             * status code, but nowhere does it specify when to use it.
   6065             * However, it seems reasonable to use it here in a similar
   6066             * fashion.
   6067             */
   6068            if (n->outstanding_aers) {
   6069                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
   6070                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
   6071                                   NVME_LOG_ERROR_INFO);
   6072            }
   6073
   6074            return;
   6075        }
   6076
   6077        cq = n->cq[qid];
   6078        if (unlikely(new_head >= cq->size)) {
   6079            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
   6080                           "completion queue doorbell write value"
    6081                           " beyond queue size, cqid=%"PRIu32","
   6082                           " new_head=%"PRIu16", ignoring",
   6083                           qid, new_head);
   6084
   6085            if (n->outstanding_aers) {
   6086                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
   6087                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
   6088                                   NVME_LOG_ERROR_INFO);
   6089            }
   6090
   6091            return;
   6092        }
   6093
   6094        trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
   6095
   6096        start_sqs = nvme_cq_full(cq) ? 1 : 0;
   6097        cq->head = new_head;
   6098        if (start_sqs) {
   6099            NvmeSQueue *sq;
   6100            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
   6101                timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
   6102            }
   6103            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
   6104        }
   6105
   6106        if (cq->tail == cq->head) {
   6107            if (cq->irq_enabled) {
   6108                n->cq_pending--;
   6109            }
   6110
   6111            nvme_irq_deassert(n, cq);
   6112        }
   6113    } else {
   6114        /* Submission queue doorbell write */
   6115
   6116        uint16_t new_tail = val & 0xffff;
   6117        NvmeSQueue *sq;
   6118
   6119        qid = (addr - 0x1000) >> 3;
   6120        if (unlikely(nvme_check_sqid(n, qid))) {
   6121            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
   6122                           "submission queue doorbell write"
   6123                           " for nonexistent queue,"
   6124                           " sqid=%"PRIu32", ignoring", qid);
   6125
   6126            if (n->outstanding_aers) {
   6127                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
   6128                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
   6129                                   NVME_LOG_ERROR_INFO);
   6130            }
   6131
   6132            return;
   6133        }
   6134
   6135        sq = n->sq[qid];
   6136        if (unlikely(new_tail >= sq->size)) {
   6137            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
   6138                           "submission queue doorbell write value"
   6139                           " beyond queue size, sqid=%"PRIu32","
   6140                           " new_tail=%"PRIu16", ignoring",
   6141                           qid, new_tail);
   6142
   6143            if (n->outstanding_aers) {
   6144                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
   6145                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
   6146                                   NVME_LOG_ERROR_INFO);
   6147            }
   6148
   6149            return;
   6150        }
   6151
   6152        trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
   6153
   6154        sq->tail = new_tail;
   6155        timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
   6156    }
   6157}
   6158
   6159static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
   6160                            unsigned size)
   6161{
   6162    NvmeCtrl *n = (NvmeCtrl *)opaque;
   6163
   6164    trace_pci_nvme_mmio_write(addr, data, size);
   6165
   6166    if (addr < sizeof(n->bar)) {
   6167        nvme_write_bar(n, addr, data, size);
   6168    } else {
   6169        nvme_process_db(n, addr, data);
   6170    }
   6171}
   6172
   6173static const MemoryRegionOps nvme_mmio_ops = {
   6174    .read = nvme_mmio_read,
   6175    .write = nvme_mmio_write,
   6176    .endianness = DEVICE_LITTLE_ENDIAN,
   6177    .impl = {
   6178        .min_access_size = 2,
   6179        .max_access_size = 8,
   6180    },
   6181};
   6182
   6183static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
   6184                           unsigned size)
   6185{
   6186    NvmeCtrl *n = (NvmeCtrl *)opaque;
   6187    stn_le_p(&n->cmb.buf[addr], size, data);
   6188}
   6189
   6190static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
   6191{
   6192    NvmeCtrl *n = (NvmeCtrl *)opaque;
   6193    return ldn_le_p(&n->cmb.buf[addr], size);
   6194}
   6195
   6196static const MemoryRegionOps nvme_cmb_ops = {
   6197    .read = nvme_cmb_read,
   6198    .write = nvme_cmb_write,
   6199    .endianness = DEVICE_LITTLE_ENDIAN,
   6200    .impl = {
   6201        .min_access_size = 1,
   6202        .max_access_size = 8,
   6203    },
   6204};
   6205
   6206static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
   6207{
   6208    NvmeParams *params = &n->params;
   6209
   6210    if (params->num_queues) {
   6211        warn_report("num_queues is deprecated; please use max_ioqpairs "
   6212                    "instead");
   6213
   6214        params->max_ioqpairs = params->num_queues - 1;
   6215    }
   6216
   6217    if (n->namespace.blkconf.blk && n->subsys) {
   6218        error_setg(errp, "subsystem support is unavailable with legacy "
   6219                   "namespace ('drive' property)");
   6220        return;
   6221    }
   6222
   6223    if (params->max_ioqpairs < 1 ||
   6224        params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
   6225        error_setg(errp, "max_ioqpairs must be between 1 and %d",
   6226                   NVME_MAX_IOQPAIRS);
   6227        return;
   6228    }
   6229
   6230    if (params->msix_qsize < 1 ||
   6231        params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
   6232        error_setg(errp, "msix_qsize must be between 1 and %d",
   6233                   PCI_MSIX_FLAGS_QSIZE + 1);
   6234        return;
   6235    }
   6236
   6237    if (!params->serial) {
   6238        error_setg(errp, "serial property not set");
   6239        return;
   6240    }
   6241
   6242    if (n->pmr.dev) {
   6243        if (host_memory_backend_is_mapped(n->pmr.dev)) {
   6244            error_setg(errp, "can't use already busy memdev: %s",
   6245                       object_get_canonical_path_component(OBJECT(n->pmr.dev)));
   6246            return;
   6247        }
   6248
   6249        if (!is_power_of_2(n->pmr.dev->size)) {
    6250            error_setg(errp, "pmr backend size needs to be a power of 2");
   6251            return;
   6252        }
   6253
   6254        host_memory_backend_set_mapped(n->pmr.dev, true);
   6255    }
   6256
   6257    if (n->params.zasl > n->params.mdts) {
   6258        error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
   6259                   "than or equal to mdts (Maximum Data Transfer Size)");
   6260        return;
   6261    }
   6262
   6263    if (!n->params.vsl) {
   6264        error_setg(errp, "vsl must be non-zero");
   6265        return;
   6266    }
   6267}
   6268
   6269static void nvme_init_state(NvmeCtrl *n)
   6270{
   6271    /* add one to max_ioqpairs to account for the admin queue pair */
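            /* each queue pair exposes two doorbells: SQ tail and CQ head */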
   6272    n->reg_size = pow2ceil(sizeof(NvmeBar) +
   6273                           2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
   6274    n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
   6275    n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
   6276    n->temperature = NVME_TEMPERATURE;
   6277    n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
   6278    n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
   6279    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
   6280}
   6281
   6282static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
   6283{
   6284    uint64_t cmb_size = n->params.cmb_size_mb * MiB;
   6285    uint64_t cap = ldq_le_p(&n->bar.cap);
   6286
   6287    n->cmb.buf = g_malloc0(cmb_size);
   6288    memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
   6289                          "nvme-cmb", cmb_size);
   6290    pci_register_bar(pci_dev, NVME_CMB_BIR,
   6291                     PCI_BASE_ADDRESS_SPACE_MEMORY |
   6292                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
   6293                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
   6294
   6295    NVME_CAP_SET_CMBS(cap, 1);
   6296    stq_le_p(&n->bar.cap, cap);
   6297
   6298    if (n->params.legacy_cmb) {
   6299        nvme_cmb_enable_regs(n);
   6300        n->cmb.cmse = true;
   6301    }
   6302}
   6303
   6304static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
   6305{
   6306    uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
   6307
   6308    NVME_PMRCAP_SET_RDS(pmrcap, 1);
   6309    NVME_PMRCAP_SET_WDS(pmrcap, 1);
   6310    NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
    6311    /* Turn on PMRWBM bit 1 (reads from PMRSTS flush prior writes) */
   6312    NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
   6313    NVME_PMRCAP_SET_CMSS(pmrcap, 1);
   6314    stl_le_p(&n->bar.pmrcap, pmrcap);
   6315
   6316    pci_register_bar(pci_dev, NVME_PMR_BIR,
   6317                     PCI_BASE_ADDRESS_SPACE_MEMORY |
   6318                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
   6319                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
   6320
   6321    memory_region_set_enabled(&n->pmr.dev->mr, false);
   6322}
   6323
   6324static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
   6325{
   6326    uint8_t *pci_conf = pci_dev->config;
   6327    uint64_t bar_size, msix_table_size, msix_pba_size;
   6328    unsigned msix_table_offset, msix_pba_offset;
   6329    int ret;
   6330
   6331    Error *err = NULL;
   6332
   6333    pci_conf[PCI_INTERRUPT_PIN] = 1;
   6334    pci_config_set_prog_interface(pci_conf, 0x2);
   6335
   6336    if (n->params.use_intel_id) {
   6337        pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
   6338        pci_config_set_device_id(pci_conf, 0x5845);
   6339    } else {
   6340        pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
   6341        pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
   6342    }
   6343
   6344    pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
   6345    pcie_endpoint_cap_init(pci_dev, 0x80);
   6346
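            /*
             * BAR0 layout: the register file at offset 0, the MSI-X table on
             * the next 4 KiB boundary, then the PBA; the total is rounded up
             * to a power of two as required for a BAR size.
             */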
   6347    bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
   6348    msix_table_offset = bar_size;
   6349    msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
   6350
   6351    bar_size += msix_table_size;
   6352    bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
   6353    msix_pba_offset = bar_size;
   6354    msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
   6355
   6356    bar_size += msix_pba_size;
   6357    bar_size = pow2ceil(bar_size);
   6358
   6359    memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
   6360    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
   6361                          n->reg_size);
   6362    memory_region_add_subregion(&n->bar0, 0, &n->iomem);
   6363
   6364    pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
   6365                     PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
   6366    ret = msix_init(pci_dev, n->params.msix_qsize,
   6367                    &n->bar0, 0, msix_table_offset,
   6368                    &n->bar0, 0, msix_pba_offset, 0, &err);
   6369    if (ret < 0) {
   6370        if (ret == -ENOTSUP) {
   6371            warn_report_err(err);
   6372        } else {
   6373            error_propagate(errp, err);
   6374            return ret;
   6375        }
   6376    }
   6377
   6378    if (n->params.cmb_size_mb) {
   6379        nvme_init_cmb(n, pci_dev);
   6380    }
   6381
   6382    if (n->pmr.dev) {
   6383        nvme_init_pmr(n, pci_dev);
   6384    }
   6385
   6386    return 0;
   6387}
   6388
   6389static void nvme_init_subnqn(NvmeCtrl *n)
   6390{
   6391    NvmeSubsystem *subsys = n->subsys;
   6392    NvmeIdCtrl *id = &n->id_ctrl;
   6393
   6394    if (!subsys) {
   6395        snprintf((char *)id->subnqn, sizeof(id->subnqn),
   6396                 "nqn.2019-08.org.qemu:%s", n->params.serial);
   6397    } else {
   6398        pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
   6399    }
   6400}
   6401
   6402static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
   6403{
   6404    NvmeIdCtrl *id = &n->id_ctrl;
   6405    uint8_t *pci_conf = pci_dev->config;
   6406    uint64_t cap = ldq_le_p(&n->bar.cap);
   6407
   6408    id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
   6409    id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
   6410    strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
   6411    strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
   6412    strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
   6413
   6414    id->cntlid = cpu_to_le16(n->cntlid);
   6415
   6416    id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
   6417
   6418    id->rab = 6;
   6419
   6420    if (n->params.use_intel_id) {
   6421        id->ieee[0] = 0xb3;
   6422        id->ieee[1] = 0x02;
   6423        id->ieee[2] = 0x00;
   6424    } else {
   6425        id->ieee[0] = 0x00;
   6426        id->ieee[1] = 0x54;
   6427        id->ieee[2] = 0x52;
   6428    }
   6429
   6430    id->mdts = n->params.mdts;
   6431    id->ver = cpu_to_le32(NVME_SPEC_VER);
   6432    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
   6433    id->cntrltype = 0x1;
   6434
   6435    /*
   6436     * Because the controller always completes the Abort command immediately,
   6437     * there can never be more than one concurrently executing Abort command,
   6438     * so this value is never used for anything. Note that there can easily be
   6439     * many Abort commands in the queues, but they are not considered
   6440     * "executing" until processed by nvme_abort.
   6441     *
   6442     * The specification recommends a value of 3 for Abort Command Limit (four
    6443     * concurrently outstanding Abort commands), so let's use that, though it
    6444     * is inconsequential.
   6445     */
   6446    id->acl = 3;
   6447    id->aerl = n->params.aerl;
   6448    id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
   6449    id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
   6450
   6451    /* recommended default value (~70 C) */
   6452    id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
   6453    id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
   6454
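            /*
             * SQES/CQES encode the maximum (upper nibble) and required minimum
             * (lower nibble) queue entry sizes as powers of two: 6 -> 64 byte
             * submission queue entries, 4 -> 16 byte completion queue entries.
             */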
   6455    id->sqes = (0x6 << 4) | 0x6;
   6456    id->cqes = (0x4 << 4) | 0x4;
   6457    id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
   6458    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
   6459                           NVME_ONCS_FEATURES | NVME_ONCS_DSM |
   6460                           NVME_ONCS_COMPARE | NVME_ONCS_COPY);
   6461
   6462    /*
   6463     * NOTE: If this device ever supports a command set that does NOT use 0x0
   6464     * as a Flush-equivalent operation, support for the broadcast NSID in Flush
   6465     * should probably be removed.
   6466     *
   6467     * See comment in nvme_io_cmd.
   6468     */
   6469    id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
   6470
   6471    id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
   6472    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
   6473                           NVME_CTRL_SGLS_BITBUCKET);
   6474
   6475    nvme_init_subnqn(n);
   6476
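            /*
             * Power state 0 descriptor. With the MXPS flag clear, MP is in
             * units of 0.01 W, so 0x9c4 (2500) advertises a 25 W maximum power
             * draw; entry and exit latencies are in microseconds.
             */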
   6477    id->psd[0].mp = cpu_to_le16(0x9c4);
   6478    id->psd[0].enlat = cpu_to_le32(0x10);
   6479    id->psd[0].exlat = cpu_to_le32(0x4);
   6480
   6481    if (n->subsys) {
   6482        id->cmic |= NVME_CMIC_MULTI_CTRL;
   6483    }
   6484
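            /*
             * Controller capabilities: MQES is zero-based, so 0x7ff permits
             * 2048 entries per queue; CQR requires physically contiguous
             * queues; TO is in 500 ms units (0xf -> 7.5 s); MPSMAX of 4 allows
             * memory pages up to 2^(12 + 4) = 64 KiB; CMBS and PMRS are only
             * advertised when the corresponding backing is configured.
             */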
   6485    NVME_CAP_SET_MQES(cap, 0x7ff);
   6486    NVME_CAP_SET_CQR(cap, 1);
   6487    NVME_CAP_SET_TO(cap, 0xf);
   6488    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
   6489    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
   6490    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
   6491    NVME_CAP_SET_MPSMAX(cap, 4);
   6492    NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
   6493    NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
   6494    stq_le_p(&n->bar.cap, cap);
   6495
   6496    stl_le_p(&n->bar.vs, NVME_SPEC_VER);
   6497    n->bar.intmc = n->bar.intms = 0;
   6498}
   6499
   6500static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
   6501{
   6502    int cntlid;
   6503
   6504    if (!n->subsys) {
   6505        return 0;
   6506    }
   6507
   6508    cntlid = nvme_subsys_register_ctrl(n, errp);
   6509    if (cntlid < 0) {
   6510        return -1;
   6511    }
   6512
   6513    n->cntlid = cntlid;
   6514
   6515    return 0;
   6516}
   6517
   6518void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
   6519{
   6520    uint32_t nsid = ns->params.nsid;
   6521    assert(nsid && nsid <= NVME_MAX_NAMESPACES);
   6522
   6523    n->namespaces[nsid] = ns;
   6524    ns->attached++;
   6525
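        /*
         * Clamp DMRSL (Dataset Management Range Size Limit, in logical blocks)
         * so that a single DSM range never exceeds what the block layer
         * accepts in one request; the minimum across attached namespaces is
         * kept.
         */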
   6526    n->dmrsl = MIN_NON_ZERO(n->dmrsl,
   6527                            BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
   6528}
   6529
   6530static void nvme_realize(PCIDevice *pci_dev, Error **errp)
   6531{
   6532    NvmeCtrl *n = NVME(pci_dev);
   6533    NvmeNamespace *ns;
   6534    Error *local_err = NULL;
   6535
   6536    nvme_check_constraints(n, &local_err);
   6537    if (local_err) {
   6538        error_propagate(errp, local_err);
   6539        return;
   6540    }
   6541
   6542    qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
   6543              &pci_dev->qdev, n->parent_obj.qdev.id);
   6544
   6545    nvme_init_state(n);
   6546    if (nvme_init_pci(n, pci_dev, errp)) {
   6547        return;
   6548    }
   6549
    6550    if (nvme_init_subsys(n, errp)) {
    6551        /* nvme_init_subsys() reports errors through errp directly */
    6552        return;
    6553    }
   6554    nvme_init_ctrl(n, pci_dev);
   6555
   6556    /* setup a namespace if the controller drive property was given */
   6557    if (n->namespace.blkconf.blk) {
   6558        ns = &n->namespace;
   6559        ns->params.nsid = 1;
   6560
   6561        if (nvme_ns_setup(ns, errp)) {
   6562            return;
   6563        }
   6564
   6565        nvme_attach_ns(n, ns);
   6566    }
   6567}
   6568
   6569static void nvme_exit(PCIDevice *pci_dev)
   6570{
   6571    NvmeCtrl *n = NVME(pci_dev);
   6572    NvmeNamespace *ns;
   6573    int i;
   6574
   6575    nvme_ctrl_reset(n);
   6576
   6577    if (n->subsys) {
   6578        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
   6579            ns = nvme_ns(n, i);
   6580            if (ns) {
   6581                ns->attached--;
   6582            }
   6583        }
   6584
   6585        nvme_subsys_unregister_ctrl(n->subsys, n);
   6586    }
   6587
   6588    g_free(n->cq);
   6589    g_free(n->sq);
   6590    g_free(n->aer_reqs);
   6591
   6592    if (n->params.cmb_size_mb) {
   6593        g_free(n->cmb.buf);
   6594    }
   6595
   6596    if (n->pmr.dev) {
   6597        host_memory_backend_set_mapped(n->pmr.dev, false);
   6598    }
   6599    msix_uninit(pci_dev, &n->bar0, &n->bar0);
   6600    memory_region_del_subregion(&n->bar0, &n->iomem);
   6601}
   6602
   6603static Property nvme_props[] = {
   6604    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
   6605    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
   6606                     HostMemoryBackend *),
   6607    DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
   6608                     NvmeSubsystem *),
   6609    DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
   6610    DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
   6611    DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
   6612    DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
   6613    DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
   6614    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
   6615    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
   6616    DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
   6617    DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
   6618    DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
   6619    DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
   6620    DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
   6621    DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
   6622                     params.auto_transition_zones, true),
   6623    DEFINE_PROP_END_OF_LIST(),
   6624};
   6625
   6626static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
   6627                                   void *opaque, Error **errp)
   6628{
   6629    NvmeCtrl *n = NVME(obj);
   6630    uint8_t value = n->smart_critical_warning;
   6631
   6632    visit_type_uint8(v, name, &value, errp);
   6633}
   6634
   6635static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
   6636                                   void *opaque, Error **errp)
   6637{
   6638    NvmeCtrl *n = NVME(obj);
   6639    uint8_t value, old_value, cap = 0, index, event;
   6640
   6641    if (!visit_type_uint8(v, name, &value, errp)) {
   6642        return;
   6643    }
   6644
   6645    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
   6646          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
   6647    if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
   6648        cap |= NVME_SMART_PMR_UNRELIABLE;
   6649    }
   6650
   6651    if ((value & cap) != value) {
   6652        error_setg(errp, "unsupported smart critical warning bits: 0x%x",
   6653                   value & ~cap);
   6654        return;
   6655    }
   6656
   6657    old_value = n->smart_critical_warning;
   6658    n->smart_critical_warning = value;
   6659
   6660    /* only inject new bits of smart critical warning */
   6661    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
   6662        event = 1 << index;
    6663        if (value & ~old_value & event) {
    6664            nvme_smart_event(n, event);
                }
   6665    }
   6666}
   6667
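        /*
         * The controller state is not versioned for migration; mark the device
         * unmigratable so migration is blocked while it is in use.
         */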
   6668static const VMStateDescription nvme_vmstate = {
   6669    .name = "nvme",
   6670    .unmigratable = 1,
   6671};
   6672
   6673static void nvme_class_init(ObjectClass *oc, void *data)
   6674{
   6675    DeviceClass *dc = DEVICE_CLASS(oc);
   6676    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
   6677
   6678    pc->realize = nvme_realize;
   6679    pc->exit = nvme_exit;
   6680    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
   6681    pc->revision = 2;
   6682
   6683    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
   6684    dc->desc = "Non-Volatile Memory Express";
   6685    device_class_set_props(dc, nvme_props);
   6686    dc->vmsd = &nvme_vmstate;
   6687}
   6688
   6689static void nvme_instance_init(Object *obj)
   6690{
   6691    NvmeCtrl *n = NVME(obj);
   6692
   6693    device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
   6694                                  "bootindex", "/namespace@1,0",
   6695                                  DEVICE(obj));
   6696
   6697    object_property_add(obj, "smart_critical_warning", "uint8",
   6698                        nvme_get_smart_warning,
   6699                        nvme_set_smart_warning, NULL, NULL);
   6700}
   6701
   6702static const TypeInfo nvme_info = {
   6703    .name          = TYPE_NVME,
   6704    .parent        = TYPE_PCI_DEVICE,
   6705    .instance_size = sizeof(NvmeCtrl),
   6706    .instance_init = nvme_instance_init,
   6707    .class_init    = nvme_class_init,
   6708    .interfaces = (InterfaceInfo[]) {
   6709        { INTERFACE_PCIE_DEVICE },
   6710        { }
   6711    },
   6712};
   6713
   6714static const TypeInfo nvme_bus_info = {
   6715    .name = TYPE_NVME_BUS,
   6716    .parent = TYPE_BUS,
   6717    .instance_size = sizeof(NvmeBus),
   6718};
   6719
   6720static void nvme_register_types(void)
   6721{
   6722    type_register_static(&nvme_info);
   6723    type_register_static(&nvme_bus_info);
   6724}
   6725
   6726type_init(nvme_register_types)