Re: [PATCH v6 2/4] block: introduce zone append write for zoned devices

2023-03-13 Thread Dmitry Fomichev
On Fri, 2023-03-10 at 18:31 +0800, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone.

s/writes is pointing to the write pointer of that zone/the call may point at any
position within the zone to which the data is being appended/

>  Upon completion the device will
> respond with the position the data

s/position the data/position where the data/

>  has been written in the zone.
> 
> Signed-off-by: Sam Li 

With nits above,

Reviewed-by: Dmitry Fomichev 

> ---
>  block/block-backend.c | 60 +++
>  block/file-posix.c    | 54 +---
>  block/io.c    | 21 +++
>  block/io_uring.c  |  4 +++
>  block/linux-aio.c |  3 ++
>  block/raw-format.c    |  8 +
>  include/block/block-io.h  |  4 +++
>  include/block/block_int-common.h  |  5 +++
>  include/block/raw-aio.h   |  4 ++-
>  include/sysemu/block-backend-io.h |  9 +
>  10 files changed, 166 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index f70b08e3f6..28e8f5d778 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1888,6 +1888,45 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, &acb->bytes,
> +   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +    QEMUIOVector *qiov, BdrvRequestFlags flags,
> +    BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .ret    = NOT_DONE,
> +    .flags  = flags,
> +    .iobuf  = qiov,
> +    };
> +    acb->bytes = *offset;
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    aio_co_enter(blk_get_aio_context(blk), co);
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1939,6 +1978,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +    QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +    blk_dec_in_flight(blk);
> +    return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 61ed769ac8..2ba9174778 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -160,6 +160,7 @@ typedef struct BDRVRawState {
>  bool has_write_zeroes:1;
>  bool use_linux_aio:1;
>  bool use_linux_io_uring:1;
> +    int64_t *offset; /* offset of zone append operation */
>  int page_cache_inconsistent; /* errno from fdatasync failure */
>  bool has_fallocate;
>  bool needs_alignment;
> @@ -1672,7 +1673,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData
> *aiocb)
>  ssize_t len;
>  
>  len = RETRY_ON_EINTR(
> -    (aiocb->aio_type & QEMU_AIO_WRITE) ?
> +    (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
>  qemu_pwritev(aiocb->aio_fildes,
>     aiocb->io.iov,
> 

Re: [PATCH v6 4/4] block: add some trace events for zone append

2023-03-13 Thread Dmitry Fomichev
On Fri, 2023-03-10 at 18:31 +0800, Sam Li wrote:
> Signed-off-by: Sam Li 

Looks good,

Reviewed-by: Dmitry Fomichev 

>  block/file-posix.c | 3 +++
>  block/trace-events | 2 ++
>  2 files changed, 5 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 2ba9174778..5187f810e5 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -2489,6 +2489,8 @@ out:
>  if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
>  if (type & QEMU_AIO_ZONE_APPEND) {
>  *s->offset = wps->wp[index];
> +    trace_zbd_zone_append_complete(bs, *s->offset
> +    >> BDRV_SECTOR_BITS);
>  }
>  /* Advance the wp if needed */
>  if (offset + bytes > wps->wp[index]) {
> @@ -3537,6 +3539,7 @@ static int coroutine_fn
> raw_co_zone_append(BlockDriverState *bs,
>  len += iov_len;
>  }
>  
> +    trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
>  return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
>  }
>  #endif
> diff --git a/block/trace-events b/block/trace-events
> index 3f4e1d088a..32665158d6 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -211,6 +211,8 @@ file_hdev_is_sg(int type, int version) "SG device found:
> type=%d, version=%d"
>  file_flush_fdatasync_failed(int err) "errno %d"
>  zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p 
> report
> %d zones starting at sector offset 0x%" PRIx64 ""
>  zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs
> %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 "
> sectors"
> +zbd_zone_append(void *bs, int64_t sector) "bs %p append at sector offset 0x%"
> PRIx64 ""
> +zbd_zone_append_complete(void *bs, int64_t sector) "bs %p returns append
> sector 0x%" PRIx64 ""
>  
>  # ssh.c
>  sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int
> sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"



Re: [PATCH v6 1/4] file-posix: add tracking of the zone write pointers

2023-03-13 Thread Dmitry Fomichev
On Fri, 2023-03-10 at 18:31 +0800, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is conventional zones.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li 
> ---
>  block/file-posix.c   | 159 ++-
>  include/block/block-common.h |  14 +++
>  include/block/block_int-common.h |   3 +
>  3 files changed, 172 insertions(+), 4 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 563acc76ae..61ed769ac8 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1324,6 +1324,77 @@ static int hdev_get_max_segments(int fd, struct stat
> *st)
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +    unsigned int nrz) {
> +    struct blk_zone *blkz;
> +    size_t rep_size;
> +    uint64_t sector = offset >> BDRV_SECTOR_BITS;
> +    int ret, n = 0, i = 0;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
> blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +
> +    rep = g_malloc(rep_size);
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +    memset(rep, 0, rep_size);
> +    rep->sector = sector;
> +    rep->nr_zones = nrz - n;
> +
> +    do {
> +    ret = ioctl(fd, BLKREPORTZONE, rep);
> +    } while (ret != 0 && errno == EINTR);
> +    if (ret != 0) {
> +    error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +    fd, offset, errno);
> +    return -errno;
> +    }
> +
> +    if (!rep->nr_zones) {
> +    break;
> +    }
> +
> +    for (i = 0; i < rep->nr_zones; i++, n++) {
> +    /*
> + * The wp tracking cares only about sequential writes required 
> and
> + * sequential write preferred zones so that the wp can advance to
> + * the right location.
> + * Use the most significant bit of the wp location to indicate 
> the
> + * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> + */
> +    if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> +    wps->wp[i] = 1ULL << 63;
> +    } else {
> +    switch(blkz[i].cond) {
> +    case BLK_ZONE_COND_FULL:
> +    case BLK_ZONE_COND_READONLY:
> +    /* Zone not writable */
> +    wps->wp[i] = (blkz[i].start + blkz[i].len) <<
> BDRV_SECTOR_BITS;
> +    break;
> +    case BLK_ZONE_COND_OFFLINE:
> +    /* Zone not writable nor readable */
> +    wps->wp[i] = (blkz[i].start) << BDRV_SECTOR_BITS;
> +    break;
> +    default:
> +    wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> +    break;
> +    }
> +    }
> +    }
> +    sector = blkz[i - 1].start + blkz[i - 1].len;
> +    }
> +
> +    return 0;
> +}
> +
> +static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +    unsigned int nrz) {
> +    if (get_zones_wp(fd, wps, offset, nrz) < 0) {
> +    error_report("update zone wp failed");
> +    }
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
> @@ -1413,6 +1484,21 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>  if (ret >= 0) {
>  bs->bl.max_active_zones = ret;
>  }
> +
> +    ret = get_sysfs_long_val(&st, "physical_block_size");
> +    if (ret >= 0) {
> +    bs->bl.write_granularity = ret;
> +    }
> +
> +    bs->bl.wps = g_malloc(sizeof(BlockZoneWps) +
> +    sizeof(int64_t) * bs->bl.nr_zones);
> +    ret = get_zones_wp(s->fd, bs->bl.wps, 0, bs->bl.nr_zones);
> +    if (ret < 0) {
> +    error_setg_errno(errp, -ret, "report wps failed");
> +    g_free(bs->bl.wps);
> +    return;
> +    }
> +    qemu_co_mutex_init(&bs->bl.wps->colock);
>  return;
>  }
>  out:
> @@ -2338,9 +2424,15 @@ static int coroutine_fn raw_co_prw(BlockDriverState 
> *bs,
> uint64_t offset,
>  {
>  BDRVRawState *s = bs->opaque;
>  RawPosixAIOData acb;
> +    int ret;
>  
>  if (fd_open(bs) < 0)
>  return -EIO;
> +#if 

Re: [PATCH v16 7/8] block: add some trace events for new block layer APIs

2023-03-13 Thread Dmitry Fomichev
On Fri, 2023-03-10 at 18:24 +0800, Sam Li wrote:
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 

With one small nit below,

Reviewed-by: Dmitry Fomichev 

> ---
>  block/file-posix.c | 3 +++
>  block/trace-events | 2 ++
>  2 files changed, 5 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 2eceb250f1..563acc76ae 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -3256,6 +3256,7 @@ static int coroutine_fn
> raw_co_zone_report(BlockDriverState *bs, int64_t offset,
>     BlockZoneDescriptor *zones) {
>  BDRVRawState *s = bs->opaque;
>  RawPosixAIOData acb;
> +    trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);

The code in this function could be made a bit simpler -

BDRVRawState *s = bs->opaque;
RawPosixAIOData acb = (RawPosixAIOData) {
.bs = bs,
.aio_fildes = s->fd,
.aio_type   = QEMU_AIO_ZONE_REPORT,
.aio_offset = offset,
.zone_report= {  
.nr_zones   = nr_zones,
.zones  = zones,
},
};   

trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);
return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);

>  
>  acb = (RawPosixAIOData) {
>  .bs = bs,
> @@ -3334,6 +3335,8 @@ static int coroutine_fn 
> raw_co_zone_mgmt(BlockDriverState
> *bs, BlockZoneOp op,
>  },
>  };
>  
> +    trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
> +    len >> BDRV_SECTOR_BITS);
>  ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, );
>  if (ret != 0) {
>  ret = -errno;
> diff --git a/block/trace-events b/block/trace-events
> index 48dbf10c66..3f4e1d088a 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -209,6 +209,8 @@ file_FindEjectableOpticalMedia(const char *media) 
> "Matching
> using %s"
>  file_setup_cdrom(const char *partition) "Using %s as optical disc"
>  file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
>  file_flush_fdatasync_failed(int err) "errno %d"
> +zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p 
> report
> %d zones starting at sector offset 0x%" PRIx64 ""
> +zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs
> %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 "
> sectors"
>  
>  # ssh.c
>  sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int
> sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"



Re: [PATCH v16 3/8] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2023-03-13 Thread Dmitry Fomichev
On Fri, 2023-03-10 at 18:23 +0800, Sam Li wrote:
> Add zoned device option to host_device BlockDriver. It will be presented only
> for zoned host block devices. By adding zone management operations to the
> host_block_device BlockDriver, users can use the new block layer APIs
> including Report Zone and four zone management operations
> (open, close, finish, reset, reset_all).
> 
> Qemu-io uses the new APIs to perform zoned storage commands of the device:
> zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
> zone_finish(zf).
> 
> For example, to test zone_report, use following command:
> $ ./build/qemu-io --image-opts -n driver=host_device, filename=/dev/nullb0
> -c "zrp offset nr_zones"
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Hannes Reinecke 
> Reviewed-by: Stefan Hajnoczi 

LGTM,

Reviewed-by: Dmitry Fomichev 

> Acked-by: Kevin Wolf 
> ---
>  block/block-backend.c | 133 +
>  block/file-posix.c    | 309 +-
>  block/io.c    |  41 
>  include/block/block-io.h  |   9 +
>  include/block/block_int-common.h  |  21 ++
>  include/block/raw-aio.h   |   6 +-
>  include/sysemu/block-backend-io.h |  18 ++
>  meson.build   |   4 +
>  qemu-io-cmds.c    | 149 ++
>  9 files changed, 687 insertions(+), 3 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 278b04ce69..f70b08e3f6 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1806,6 +1806,139 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>  return ret;
>  }
>  
> +static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
> +   (unsigned int*)acb->bytes,rwco->iobuf);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> +    unsigned int *nr_zones,
> +    BlockZoneDescriptor  *zones,
> +    BlockCompletionFunc *cb, void *opaque)
> +{
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .offset = offset,
> +    .iobuf  = zones,
> +    .ret    = NOT_DONE,
> +    };
> +    acb->bytes = (int64_t)nr_zones,
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
> +    aio_co_enter(blk_get_aio_context(blk), co);
> +
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
> +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_mgmt(rwco->blk, (BlockZoneOp)rwco->iobuf,
> + rwco->offset, acb->bytes);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +  int64_t offset, int64_t len,
> +  BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .offset = offset,
> +    .iobuf  = (void *)op,
> +    .ret    = NOT_DONE,
> +    };
> +    acb->bytes = len;
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
> +    aio_co_enter(blk_get_aio_context(blk), co);
> +
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
> +/*
> + * Send a zone_report command.
> + * offset is a byte offset from the start of the device. No alignment
> + * required for offset.
> + * nr_zones represents IN maximum and OUT actual.
> + */
> 

Re: [PATCH v16 1/8] include: add zoned device structs

2023-03-13 Thread Dmitry Fomichev
On Fri, 2023-03-10 at 18:23 +0800, Sam Li wrote:
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Damien Le Moal 
> Reviewed-by: Hannes Reinecke 

Looks good to me.

Reviewed-by: Dmitry Fomichev 

> ---
>  include/block/block-common.h | 43 
>  1 file changed, 43 insertions(+)
> 
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index b5122ef8ab..1576fcf2ed 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -75,6 +75,49 @@ typedef struct BlockDriver BlockDriver;
>  typedef struct BdrvChild BdrvChild;
>  typedef struct BdrvChildClass BdrvChildClass;
>  
> +typedef enum BlockZoneOp {
> +    BLK_ZO_OPEN,
> +    BLK_ZO_CLOSE,
> +    BLK_ZO_FINISH,
> +    BLK_ZO_RESET,
> +} BlockZoneOp;
> +
> +typedef enum BlockZoneModel {
> +    BLK_Z_NONE = 0x0, /* Regular block device */
> +    BLK_Z_HM = 0x1, /* Host-managed zoned block device */
> +    BLK_Z_HA = 0x2, /* Host-aware zoned block device */
> +} BlockZoneModel;
> +
> +typedef enum BlockZoneState {
> +    BLK_ZS_NOT_WP = 0x0,
> +    BLK_ZS_EMPTY = 0x1,
> +    BLK_ZS_IOPEN = 0x2,
> +    BLK_ZS_EOPEN = 0x3,
> +    BLK_ZS_CLOSED = 0x4,
> +    BLK_ZS_RDONLY = 0xD,
> +    BLK_ZS_FULL = 0xE,
> +    BLK_ZS_OFFLINE = 0xF,
> +} BlockZoneState;
> +
> +typedef enum BlockZoneType {
> +    BLK_ZT_CONV = 0x1, /* Conventional random writes supported */
> +    BLK_ZT_SWR = 0x2, /* Sequential writes required */
> +    BLK_ZT_SWP = 0x3, /* Sequential writes preferred */
> +} BlockZoneType;
> +
> +/*
> + * Zone descriptor data structure.
> + * Provides information on a zone with all position and size values in bytes.
> + */
> +typedef struct BlockZoneDescriptor {
> +    uint64_t start;
> +    uint64_t length;
> +    uint64_t cap;
> +    uint64_t wp;
> +    BlockZoneType type;
> +    BlockZoneState state;
> +} BlockZoneDescriptor;
> +
>  typedef struct BlockDriverInfo {
>  /* in bytes, 0 if irrelevant */
>  int cluster_size;



Re: [PATCH v12 6/7] qemu-iotests: test new zone operations

2022-10-23 Thread Dmitry Fomichev
On Tue, 2022-10-18 at 16:41 +0800, Sam Li wrote:
> Dmitry Fomichev  于2022年10月17日周一 08:57写道:
> > 
> > On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> > > We have added new block layer APIs of zoned block devices.
> > > Test it with:
> > > Create a null_blk device, run each zone operation on it and see
> > > whether reporting right zone information.
> > 
> > change this to "whether the logs show the correct zone information"?
> > 
> > > 
> > 
> > Could you please describe how to run this specific set of tests
> > in more detail?
> 
> I just run this script on the terminal directly and redirect the
> result to the output file.
> 
> Maybe?
> + Run each zone operation on a newly created null_blk device and see
> + whether the logs show the correct zone information.

Yes, this is fine. And specify the exact command line that you are using
to run the test.

> 
> > 
> > > 
> > > Signed-off-by: Sam Li 
> > > Reviewed-by: Stefan Hajnoczi 
> > > ---
> > >  tests/qemu-iotests/tests/zoned.out | 53 ++
> > >  tests/qemu-iotests/tests/zoned.sh  | 86 ++
> > >  2 files changed, 139 insertions(+)
> > >  create mode 100644 tests/qemu-iotests/tests/zoned.out
> > >  create mode 100755 tests/qemu-iotests/tests/zoned.sh
> > > 
> > > diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-
> > > iotests/tests/zoned.out
> > > new file mode 100644
> > > index 00..0c8f96deb9
> > > --- /dev/null
> > > +++ b/tests/qemu-iotests/tests/zoned.out
> > > @@ -0,0 +1,53 @@
> > > +QA output created by zoned.sh
> > > +Testing a null_blk device:
> > > +Simple cases: if the operations work
> > > +(1) report the first zone:
> > > +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
> > > +
> > > +report the first 10 zones
> > > +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
> > > +start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 
> > > 2]
> > > +start: 0x10, len 0x8, cap 0x8, wptr 0x10, zcond:1, [type:
> > > 2]
> > > +start: 0x18, len 0x8, cap 0x8, wptr 0x18, zcond:1, [type:
> > > 2]
> > > +start: 0x20, len 0x8, cap 0x8, wptr 0x20, zcond:1, [type:
> > > 2]
> > > +start: 0x28, len 0x8, cap 0x8, wptr 0x28, zcond:1, [type:
> > > 2]
> > > +start: 0x30, len 0x8, cap 0x8, wptr 0x30, zcond:1, [type:
> > > 2]
> > > +start: 0x38, len 0x8, cap 0x8, wptr 0x38, zcond:1, [type:
> > > 2]
> > > +start: 0x40, len 0x8, cap 0x8, wptr 0x40, zcond:1, [type:
> > > 2]
> > > +start: 0x48, len 0x8, cap 0x8, wptr 0x48, zcond:1, [type:
> > > 2]
> > > +
> > > +report the last zone:
> > > +start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:1,
> > > [type:
> > > 2]
> > > +
> > > +
> > > +(2) opening the first zone
> > > +report after:
> > > +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:3, [type: 2]
> > > +
> > > +opening the second zone
> > > +report after:
> > > +start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:3, [type: 
> > > 2]
> > > +
> > > +opening the last zone
> > > +report after:
> > > +start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:3,
> > > [type:
> > > 2]
> > > +
> > > +
> > > +(3) closing the first zone
> > > +report after:
> > > +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
> > > +
> > > +closing the last zone
> > > +report after:
> > > +start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:1,
> > > [type:
> > > 2]
> > > +
> > > +
> > > +(4) finishing the second zone
> > > +After finishing a zone:
> > > +start: 0x8, len 0x8, cap 0x8, wptr 0x10, zcond:14, [type:
> > > 2]
> > > +
> > > +
> > > +(5) resetting the second zone
> > > +After resetting a zone:
> > > +start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 
> > > 2]
> > > +*** done
> > > diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-
> > > iotests/tests/zoned.sh
> > > new file mo

Re: [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices

2022-10-23 Thread Dmitry Fomichev
On Tue, 2022-10-18 at 16:56 +0800, Sam Li wrote:
> Dmitry Fomichev  于2022年10月17日周一 09:01写道:
> > 
> > On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
> > > This patch extends virtio-blk emulation to handle zoned device commands
> > > by calling the new block layer APIs to perform zoned device I/O on
> > > behalf of the guest. It supports Report Zone, four zone oparations (open,
> > > close, finish, reset), and Append Zone.
> > > 
> > > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> > > support zoned block devices. Regular block devices(conventional zones)
> > > will not be set.
> > > 
> > > Then the guest os can use blkzone(8) to test those commands on zoned
> > > devices.
> > > Furthermore, using zonefs to test zone append write is also supported.
> > > 
> > > Signed-off-by: Sam Li 
> > > ---
> > >  hw/block/virtio-blk-common.c   |   2 +
> > >  hw/block/virtio-blk.c  | 412 -
> > >  include/hw/virtio/virtio-blk.h |  11 +-
> > >  3 files changed, 422 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> > > index ac52d7c176..e2f8e2f6da 100644
> > > --- a/hw/block/virtio-blk-common.c
> > > +++ b/hw/block/virtio-blk-common.c
> > > @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
> > >   .end = endof(struct virtio_blk_config, discard_sector_alignment)},
> > >  {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
> > >   .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> > > +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> > > + .end = endof(struct virtio_blk_config, zoned)},
> > >  {}
> > >  };
> > > 
> > > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > > index 8131ec2dbc..58891aea31 100644
> > > --- a/hw/block/virtio-blk.c
> > > +++ b/hw/block/virtio-blk.c
> > > @@ -26,6 +26,9 @@
> > >  #include "hw/virtio/virtio-blk.h"
> > >  #include "dataplane/virtio-blk.h"
> > >  #include "scsi/constants.h"
> > > +#if defined(CONFIG_BLKZONED)
> > > +#include 
> > > +#endif
> > >  #ifdef __linux__
> > >  # include 
> > >  #endif
> > > @@ -55,10 +58,29 @@ static void virtio_blk_req_complete(VirtIOBlockReq
> > > *req,
> > > unsigned char status)
> > >  {
> > >  VirtIOBlock *s = req->dev;
> > >  VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > +    int64_t inhdr_len, n;
> > > +    void *buf;
> > > 
> > >  trace_virtio_blk_req_complete(vdev, req, status);
> > > 
> > > -    stb_p(&req->in->status, status);
> > > +    iov_discard_undo(&req->inhdr_undo);
> > > +    if (virtio_ldl_p(vdev, >out.type) == VIRTIO_BLK_T_ZONE_APPEND) {
> > > +    inhdr_len = sizeof(struct virtio_blk_zone_append_inhdr);
> > > +    req->in.in_hdr->status = status;
> > > +    buf = req->in.in_hdr;
> > > +    } else {
> > > +    inhdr_len = sizeof(struct virtio_blk_inhdr);
> > > +    req->in.zone_append_inhdr->status = status;
> > > +    buf = req->in.zone_append_inhdr;
> > > +    }
> > > +
> > > +    n = iov_from_buf(req->elem.in_sg, req->elem.in_num,
> > > + req->in_len - inhdr_len, buf, inhdr_len);
> > > +    if (n != inhdr_len) {
> > > +    virtio_error(vdev, "Driver provided input buffer less than size 
> > > of
> > > "
> > > + "in header");
> > > +    }
> > > +
> > >  iov_discard_undo(&req->inhdr_undo);
> > >  iov_discard_undo(&req->outhdr_undo);
> > >  virtqueue_push(req->vq, &req->elem, req->in_len);
> > > @@ -592,6 +614,334 @@ err:
> > >  return err_status;
> > >  }
> > > 
> > > +typedef struct ZoneCmdData {
> > > +    VirtIOBlockReq *req;
> > > +    union {
> > > +    struct {
> > > +    unsigned int nr_zones;
> > > +    BlockZoneDescriptor *zones;
> > > +    } zone_report_data;
> > > +    struct {
> > > +    int64_t offset;
> > > +    } zone_append_data;
> > > +    };
> > > +} ZoneCmdData;
> > > +
> > > +/*
> > > + * check zoned_request:

Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c | 65 ++
>  block/file-posix.c    | 89 +--
>  block/io.c    | 21 
>  block/raw-format.c    |  8 +++
>  include/block/block-io.h  |  3 ++
>  include/block/block_int-common.h  |  5 ++
>  include/block/raw-aio.h   |  4 +-
>  include/sysemu/block-backend-io.h |  9 
>  8 files changed, 198 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 1c618e9c68..06931ddd24 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>  struct {
>  unsigned long op;
>  } zone_mgmt;
> +    struct {
> +    int64_t *append_sector;
> +    } zone_append;
>  };
>  } BlkRwCo;
>  
> @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, 
> rwco->zone_append.append_sector,
> +   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +    QEMUIOVector *qiov, BdrvRequestFlags flags,
> +    BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .ret    = NOT_DONE,
> +    .flags  = flags,
> +    .iobuf  = qiov,
> +    .zone_append = {
> +    .append_sector = offset,
> +    },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +    QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +    blk_dec_in_flight(blk);
> +    return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 5ff5500301..3d0cc33d02 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
>  struct {
>  struct iovec *iov;
>  int niov;
> +    int64_t *offset;
>  } io;
>  struct {
>  uint64_t cmd;
> @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>  bs->bl.max_active_zones = ret;
>  }
>  
> +    ret = get_sysfs_long_val(&st, "physical_block_size");
> +    if (ret >= 0) {
> +    bs->bl.write_granularity = ret;
> +    }
> +
>  bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>  if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
>  error_report("report wps failed");
> @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int
> nr_iov, off_t offset)
>  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>  {
>  ssize_t len;
> +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;

Can this code ever be called for a non-zoned device with 0 zone size?
If yes, you need to avoid division by zero here...

> +
> +    if 

Re: [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
> This patch extends virtio-blk emulation to handle zoned device commands
> by calling the new block layer APIs to perform zoned device I/O on
> behalf of the guest. It supports Report Zone, four zone oparations (open,
> close, finish, reset), and Append Zone.
> 
> The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> support zoned block devices. Regular block devices (conventional zones)
> will not have it set.
> 
> Then the guest os can use blkzone(8) to test those commands on zoned devices.
> Furthermore, using zonefs to test zone append write is also supported.
> 
> Signed-off-by: Sam Li 
> ---
>  hw/block/virtio-blk-common.c   |   2 +
>  hw/block/virtio-blk.c  | 412 -
>  include/hw/virtio/virtio-blk.h |  11 +-
>  3 files changed, 422 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> index ac52d7c176..e2f8e2f6da 100644
> --- a/hw/block/virtio-blk-common.c
> +++ b/hw/block/virtio-blk-common.c
> @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
>   .end = endof(struct virtio_blk_config, discard_sector_alignment)},
>  {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
>   .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> + .end = endof(struct virtio_blk_config, zoned)},
>  {}
>  };
>  
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 8131ec2dbc..58891aea31 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -26,6 +26,9 @@
>  #include "hw/virtio/virtio-blk.h"
>  #include "dataplane/virtio-blk.h"
>  #include "scsi/constants.h"
> +#if defined(CONFIG_BLKZONED)
> +#include 
> +#endif
>  #ifdef __linux__
>  # include 
>  #endif
> @@ -55,10 +58,29 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req,
> unsigned char status)
>  {
>  VirtIOBlock *s = req->dev;
>  VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    int64_t inhdr_len, n;
> +    void *buf;
>  
>  trace_virtio_blk_req_complete(vdev, req, status);
>  
> -    stb_p(>in->status, status);
> +    iov_discard_undo(>inhdr_undo);
> +    if (virtio_ldl_p(vdev, >out.type) == VIRTIO_BLK_T_ZONE_APPEND) {
> +    inhdr_len = sizeof(struct virtio_blk_zone_append_inhdr);
> +    req->in.in_hdr->status = status;
> +    buf = req->in.in_hdr;
> +    } else {
> +    inhdr_len = sizeof(struct virtio_blk_inhdr);
> +    req->in.zone_append_inhdr->status = status;
> +    buf = req->in.zone_append_inhdr;
> +    }
> +
> +    n = iov_from_buf(req->elem.in_sg, req->elem.in_num,
> + req->in_len - inhdr_len, buf, inhdr_len);
> +    if (n != inhdr_len) {
> +    virtio_error(vdev, "Driver provided input buffer less than size of "
> + "in header");
> +    }
> +
>  iov_discard_undo(>inhdr_undo);
>  iov_discard_undo(>outhdr_undo);
>  virtqueue_push(req->vq, >elem, req->in_len);
> @@ -592,6 +614,334 @@ err:
>  return err_status;
>  }
>  
> +typedef struct ZoneCmdData {
> +    VirtIOBlockReq *req;
> +    union {
> +    struct {
> +    unsigned int nr_zones;
> +    BlockZoneDescriptor *zones;
> +    } zone_report_data;
> +    struct {
> +    int64_t offset;
> +    } zone_append_data;
> +    };
> +} ZoneCmdData;
> +
> +/*
> + * check zoned_request: error checking before issuing requests. If all checks
> + * passed, return true.
> + * append: true if only zone append requests issued.
> + */
> +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> + bool append, uint8_t *status) {
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int index = offset / bs->bl.zone_size;
> +
> +    if (offset < 0 || len < 0 || offset > bs->bl.capacity - len) {
> +    *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +    return false;
> +    }
> +
> +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> +    *status = VIRTIO_BLK_S_UNSUPP;
> +    return false;
> +    }
> +
> +    if (append) {
> +    if ((offset % bs->bl.write_granularity) != 0) {
> +    *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> +    return false;
> +    }
> +
> +    if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
> +    *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +    return false;
> +    }
> +
> +    if (len / 512 > bs->bl.max_append_sectors) {
> +    if (bs->bl.max_append_sectors == 0) {
> +    *status = VIRTIO_BLK_S_UNSUPP;
> +    } else {
> +    *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +    }
> +    return false;
> +    }
> +    }
> +    return true;
> +}
> +
> +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = 

Re: [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> whether the zone is a conventional zone.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li 
> ---
>  block/file-posix.c   | 144 +++
>  include/block/block-common.h |  14 +++
>  include/block/block_int-common.h |   3 +
>  3 files changed, 161 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 7c5a330fc1..5ff5500301 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1324,6 +1324,66 @@ static int hdev_get_max_segments(int fd, struct stat
> *st)
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +    unsigned int nrz) {
> +    struct blk_zone *blkz;
> +    int64_t rep_size;

size_t

> +    int64_t sector = offset >> BDRV_SECTOR_BITS;

uint64_t

> +    int ret, n = 0, i = 0;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
> blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +
> +    rep = g_malloc(rep_size);
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +    memset(rep, 0, rep_size);
> +    rep->sector = sector;
> +    rep->nr_zones = nrz - n;
> +
> +    do {
> +    ret = ioctl(fd, BLKREPORTZONE, rep);
> +    } while (ret != 0 && errno == EINTR);
> +    if (ret != 0) {
> +    error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +    fd, offset, errno);
> +    return -errno;
> +    }
> +
> +    if (!rep->nr_zones) {
> +    break;
> +    }
> +
> +    for (i = 0; i < rep->nr_zones; i++, n++) {
> +    /*
> + * The wp tracking cares only about sequential writes required 
> and
> + * sequential write preferred zones so that the wp can advance to
> + * the right location.
> + * Use the most significant bit of the wp location to indicate 
> the
> + * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> + */
> +    if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> +    wps->wp[i] = 1ULL << 63;
> +    } else {
> +    wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> +    }
> +    }
> +    sector = blkz[i - 1].start + blkz[i - 1].len;
> +    }
> +
> +    return 0;
> +}
> +
> +static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +    unsigned int nrz) {
> +    qemu_mutex_lock(>lock);
> +    if (get_zones_wp(fd, wps, offset, nrz) < 0) {
> +    error_report("update zone wp failed");
> +    }
> +    qemu_mutex_unlock(>lock);
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
> @@ -1414,6 +1474,14 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>  if (ret >= 0) {
>  bs->bl.max_active_zones = ret;
>  }
> +
> +    bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> +    if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
> +    error_report("report wps failed");
> +    g_free(bs->bl.wps);
> +    return;
> +    }
> +    qemu_mutex_init(>bl.wps->lock);
>  }
>  }
>  
> @@ -1725,6 +1793,25 @@ static int handle_aiocb_rw(void *opaque)
>  
>  out:
>  if (nbytes == aiocb->aio_nbytes) {
> +#if defined(CONFIG_BLKZONED)
> +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +    if (wps) {

In my testing, I get a divide by zero exception in the "index"
calculation above. Changing this part as follows

-int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
-if (wps) {
+if (wps && aiocb->bs->bl.zone_size) {
+int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+

fixes the crash.

> +    qemu_mutex_lock(>lock);
> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +    uint64_t wend_offset =
> +    aiocb->aio_offset + aiocb->aio_nbytes;
> +
> +    /* Advance the wp if needed */
> +    if (wend_offset > wps->wp[index]) {
> +  

Re: [PATCH v12 6/7] qemu-iotests: test new zone operations

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> We have added new block layer APIs of zoned block devices.
> Test it with:
> Create a null_blk device, run each zone operation on it and see
> whether reporting right zone information.

change this to "whether the logs show the correct zone information"?

> 

Could you please describe how to run this specific set of tests
in more detail?
 
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> ---
>  tests/qemu-iotests/tests/zoned.out | 53 ++
>  tests/qemu-iotests/tests/zoned.sh  | 86 ++
>  2 files changed, 139 insertions(+)
>  create mode 100644 tests/qemu-iotests/tests/zoned.out
>  create mode 100755 tests/qemu-iotests/tests/zoned.sh
> 
> diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-
> iotests/tests/zoned.out
> new file mode 100644
> index 00..0c8f96deb9
> --- /dev/null
> +++ b/tests/qemu-iotests/tests/zoned.out
> @@ -0,0 +1,53 @@
> +QA output created by zoned.sh
> +Testing a null_blk device:
> +Simple cases: if the operations work
> +(1) report the first zone:
> +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
> +
> +report the first 10 zones
> +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
> +start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 2]
> +start: 0x10, len 0x8, cap 0x8, wptr 0x10, zcond:1, [type: 2]
> +start: 0x18, len 0x8, cap 0x8, wptr 0x18, zcond:1, [type: 2]
> +start: 0x20, len 0x8, cap 0x8, wptr 0x20, zcond:1, [type: 2]
> +start: 0x28, len 0x8, cap 0x8, wptr 0x28, zcond:1, [type: 2]
> +start: 0x30, len 0x8, cap 0x8, wptr 0x30, zcond:1, [type: 2]
> +start: 0x38, len 0x8, cap 0x8, wptr 0x38, zcond:1, [type: 2]
> +start: 0x40, len 0x8, cap 0x8, wptr 0x40, zcond:1, [type: 2]
> +start: 0x48, len 0x8, cap 0x8, wptr 0x48, zcond:1, [type: 2]
> +
> +report the last zone:
> +start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:1, [type:
> 2]
> +
> +
> +(2) opening the first zone
> +report after:
> +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:3, [type: 2]
> +
> +opening the second zone
> +report after:
> +start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:3, [type: 2]
> +
> +opening the last zone
> +report after:
> +start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:3, [type:
> 2]
> +
> +
> +(3) closing the first zone
> +report after:
> +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
> +
> +closing the last zone
> +report after:
> +start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:1, [type:
> 2]
> +
> +
> +(4) finishing the second zone
> +After finishing a zone:
> +start: 0x8, len 0x8, cap 0x8, wptr 0x10, zcond:14, [type: 2]
> +
> +
> +(5) resetting the second zone
> +After resetting a zone:
> +start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 2]
> +*** done
> diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-
> iotests/tests/zoned.sh
> new file mode 100755
> index 00..fced0194c5
> --- /dev/null
> +++ b/tests/qemu-iotests/tests/zoned.sh
> @@ -0,0 +1,86 @@
> +#!/usr/bin/env bash
> +#
> +# Test zone management operations.
> +#
> +
> +seq="$(basename $0)"
> +echo "QA output created by $seq"
> +status=1 # failure is the default!
> +
> +_cleanup()
> +{
> +  _cleanup_test_img
> +  sudo rmmod null_blk
> +}
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +# get standard environment, filters and checks
> +. ./common.rc
> +. ./common.filter
> +. ./common.qemu
> +
> +# This test only runs on Linux hosts with raw image files.
> +_supported_fmt raw
> +_supported_proto file
> +_supported_os Linux
> +
> +QEMU_IO="build/qemu-io"
> +IMG="--image-opts -n driver=zoned_host_device,filename=/dev/nullb0"
> +QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
> +
> +echo "Testing a null_blk device:"
> +echo "case 1: if the operations work"
> +sudo modprobe null_blk nr_devices=1 zoned=1
> +
> +echo "(1) report the first zone:"
> +sudo $QEMU_IO $IMG -c "zrp 0 1"
> +echo
> +echo "report the first 10 zones"
> +sudo $QEMU_IO $IMG -c "zrp 0 10"
> +echo
> +echo "report the last zone:"
> +sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2" # 0x3e7000 / 512 = 0x1f38
> +echo
> +echo
> +echo "(2) opening the first zone"
> +sudo $QEMU_IO $IMG -c "zo 0 268435456"  # 268435456 / 512 = 524288
> +echo "report after:"
> +sudo $QEMU_IO $IMG -c "zrp 0 1"
> +echo
> +echo "opening the second zone"
> +sudo $QEMU_IO $IMG -c "zo 268435456 268435456" #
> +echo "report after:"
> +sudo $QEMU_IO $IMG -c "zrp 268435456 1"
> +echo
> +echo "opening the last zone"
> +sudo $QEMU_IO $IMG -c "zo 0x3e7000 268435456"
> +echo "report after:"
> +sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2"
> +echo
> +echo
> +echo "(3) closing the first zone"
> +sudo $QEMU_IO $IMG -c "zc 0 268435456"
> +echo "report after:"
> +sudo 

Re: [PATCH v12 5/7] config: add check to block layer

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> Putting zoned/non-zoned BlockDrivers on top of each other is not
> allowed.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Hannes Reinecke 

Reviewed-by: Dmitry Fomichev 

> ---
>  block.c  | 19 +++
>  block/file-posix.c   | 12 
>  block/raw-format.c   |  1 +
>  include/block/block_int-common.h |  5 +
>  4 files changed, 37 insertions(+)
> 
> diff --git a/block.c b/block.c
> index 1fbf6b9e69..5d6fa4a25a 100644
> --- a/block.c
> +++ b/block.c
> @@ -7951,6 +7951,25 @@ void bdrv_add_child(BlockDriverState *parent_bs,
> BlockDriverState *child_bs,
>  return;
>  }
>  
> +    /*
> + * Non-zoned block drivers do not follow zoned storage constraints
> + * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
> + * drivers in a graph.
> + */
> +    if (!parent_bs->drv->supports_zoned_children &&
> +    child_bs->bl.zoned == BLK_Z_HM) {
> +    /*
> + * The host-aware model allows zoned storage constraints and random
> + * write. Allow mixing host-aware and non-zoned drivers. Using
> + * host-aware device as a regular device.
> + */
> +    error_setg(errp, "Cannot add a %s child to a %s parent",
> +   child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
> +   parent_bs->drv->supports_zoned_children ?
> +   "support zoned children" : "not support zoned children");
> +    return;
> +    }
> +
>  if (!QLIST_EMPTY(_bs->parents)) {
>  error_setg(errp, "The node %s already has a parent",
>     child_bs->node_name);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index bd28e3eaea..7c5a330fc1 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -776,6 +776,18 @@ static int raw_open_common(BlockDriverState *bs, QDict
> *options,
>  goto fail;
>  }
>  }
> +#ifdef CONFIG_BLKZONED
> +    /*
> + * The kernel page cache does not reliably work for writes to SWR zones
> + * of zoned block device because it can not guarantee the order of 
> writes.
> + */
> +    if ((strcmp(bs->drv->format_name, "zoned_host_device") == 0) &&
> +    (!(s->open_flags & O_DIRECT))) {
> +    error_setg(errp, "driver=zoned_host_device was specified, but it "
> +   "requires cache.direct=on, which was not specified.");
> +    return -EINVAL; /* No host kernel page cache */
> +    }
> +#endif
>  
>  if (S_ISBLK(st.st_mode)) {
>  #ifdef __linux__
> diff --git a/block/raw-format.c b/block/raw-format.c
> index bac43f1d25..18dc52a150 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -615,6 +615,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild
> *c,
>  BlockDriver bdrv_raw = {
>  .format_name  = "raw",
>  .instance_size    = sizeof(BDRVRawState),
> +    .supports_zoned_children = true,
>  .bdrv_probe   = _probe,
>  .bdrv_reopen_prepare  = _reopen_prepare,
>  .bdrv_reopen_commit   = _reopen_commit,
> diff --git a/include/block/block_int-common.h b/include/block/block_int-
> common.h
> index cdc06e77a6..37dddc603c 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -127,6 +127,11 @@ struct BlockDriver {
>   */
>  bool is_format;
>  
> +    /*
> + * Set to true if the BlockDriver supports zoned children.
> + */
> +    bool supports_zoned_children;
> +
>  /*
>   * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
>   * this field set to true, except ones that are defined only by their



Re: [PATCH v12 4/7] raw-format: add zone operations to pass through requests

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> raw-format driver usually sits on top of file-posix driver. It needs to
> pass through requests of zone commands.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Damien Le Moal 
> Reviewed-by: Hannes Reinecke 

Reviewed-by: Dmitry Fomichev 

> ---
>  block/raw-format.c | 13 +
>  1 file changed, 13 insertions(+)
> 
> diff --git a/block/raw-format.c b/block/raw-format.c
> index f337ac7569..bac43f1d25 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -314,6 +314,17 @@ static int coroutine_fn raw_co_pdiscard(BlockDriverState
> *bs,
>  return bdrv_co_pdiscard(bs->file, offset, bytes);
>  }
>  
> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t
> offset,
> +   unsigned int *nr_zones,
> +   BlockZoneDescriptor *zones) {
> +    return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
> +}
> +
> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp 
> op,
> + int64_t offset, int64_t len) {
> +    return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
> +}
> +
>  static int64_t raw_getlength(BlockDriverState *bs)
>  {
>  int64_t len;
> @@ -615,6 +626,8 @@ BlockDriver bdrv_raw = {
>  .bdrv_co_pwritev  = _co_pwritev,
>  .bdrv_co_pwrite_zeroes = _co_pwrite_zeroes,
>  .bdrv_co_pdiscard = _co_pdiscard,
> +    .bdrv_co_zone_report  = _co_zone_report,
> +    .bdrv_co_zone_mgmt  = _co_zone_mgmt,
>  .bdrv_co_block_status = _co_block_status,
>  .bdrv_co_copy_range_from = _co_copy_range_from,
>  .bdrv_co_copy_range_to  = _co_copy_range_to,



Re: [PATCH v12 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> Add a new zoned_host_device BlockDriver. The zoned_host_device option
> accepts only zoned host block devices. By adding zone management
> operations in this new BlockDriver, users can use the new block
> layer APIs including Report Zone and four zone management operations
> (open, close, finish, reset, reset_all).
> 
> Qemu-io uses the new APIs to perform zoned storage commands of the device:
> zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
> zone_finish(zf).
> 
> For example, to test zone_report, use following command:
> $ ./build/qemu-io --image-opts -n driver=zoned_host_device,
> filename=/dev/nullb0
> -c "zrp offset nr_zones"
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Hannes Reinecke 
> ---
>  block/block-backend.c | 148 +
>  block/file-posix.c    | 335 ++
>  block/io.c    |  41 
>  include/block/block-io.h  |   7 +
>  include/block/block_int-common.h  |  24 +++
>  include/block/raw-aio.h   |   6 +-
>  include/sysemu/block-backend-io.h |  18 ++
>  meson.build   |   4 +
>  qapi/block-core.json  |   8 +-
>  qemu-io-cmds.c    | 149 +
>  10 files changed, 737 insertions(+), 3 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index aa4adf06ae..1c618e9c68 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
>  void *iobuf;
>  int ret;
>  BdrvRequestFlags flags;
> +    union {
> +    struct {
> +    unsigned int *nr_zones;
> +    BlockZoneDescriptor *zones;
> +    } zone_report;
> +    struct {
> +    unsigned long op;
> +    } zone_mgmt;
> +    };
>  } BlkRwCo;
>  
>  int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
> @@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>  return ret;
>  }
>  
> +static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = >rwco;
> +
> +    rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
> +   rwco->zone_report.nr_zones,
> +   rwco->zone_report.zones);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> +    unsigned int *nr_zones,
> +    BlockZoneDescriptor  *zones,
> +    BlockCompletionFunc *cb, void *opaque)
> +{
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .offset = offset,
> +    .ret    = NOT_DONE,
> +    .zone_report = {
> +    .zones = zones,
> +    .nr_zones = nr_zones,
> +    },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return >common;
> +}
> +
> +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = >rwco;
> +
> +    rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
> + rwco->offset, acb->bytes);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +  int64_t offset, int64_t len,
> +  BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .offset = offset,
> +    .ret    = NOT_DONE,
> +    .zone_mgmt = {
> +    .op = op,
> +    },
> +    };
> +    acb->bytes = len;
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return >common;
> +}
> +
> +/*
> + * Send a zone_report command.
> + * offset is a byte offset from the start of the device. No alignment
> + * required for offset.
> + * nr_zones represents IN maximum and OUT actual.

Re: [PATCH v12 1/7] include: add zoned device structs

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Damien Le Moal 
> Reviewed-by: Hannes Reinecke 
> ---
>  include/block/block-common.h | 43 
>  1 file changed, 43 insertions(+)
> 
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index fdb7306e78..36bd0e480e 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -49,6 +49,49 @@ typedef struct BlockDriver BlockDriver;
>  typedef struct BdrvChild BdrvChild;
>  typedef struct BdrvChildClass BdrvChildClass;
>  
> +typedef enum BlockZoneOp {
> +    BLK_ZO_OPEN,
> +    BLK_ZO_CLOSE,
> +    BLK_ZO_FINISH,
> +    BLK_ZO_RESET,
> +} BlockZoneOp;
> +
> +typedef enum BlockZoneModel {
> +    BLK_Z_NONE = 0x0, /* Regular block device */
> +    BLK_Z_HM = 0x1, /* Host-managed zoned block device */
> +    BLK_Z_HA = 0x2, /* Host-aware zoned block device */
> +} BlockZoneModel;
> +
> +typedef enum BlockZoneCondition {
> +    BLK_ZS_NOT_WP = 0x0,
> +    BLK_ZS_EMPTY = 0x1,
> +    BLK_ZS_IOPEN = 0x2,
> +    BLK_ZS_EOPEN = 0x3,
> +    BLK_ZS_CLOSED = 0x4,
> +    BLK_ZS_RDONLY = 0xD,
> +    BLK_ZS_FULL = 0xE,
> +    BLK_ZS_OFFLINE = 0xF,
> +} BlockZoneCondition;

The virtio-zbd specification doesn't define conditions; it uses the term
"state" instead, similar to ZNS. Please rename BlockZoneCondition to
BlockZoneState to follow the spec terminology.

> +
> +typedef enum BlockZoneType {
> +    BLK_ZT_CONV = 0x1, /* Conventional random writes supported */
> +    BLK_ZT_SWR = 0x2, /* Sequential writes required */
> +    BLK_ZT_SWP = 0x3, /* Sequential writes preferred */
> +} BlockZoneType;
> +
> +/*
> + * Zone descriptor data structure.
> + * Provides information on a zone with all position and size values in bytes.
> + */
> +typedef struct BlockZoneDescriptor {
> +    uint64_t start;
> +    uint64_t length;
> +    uint64_t cap;
> +    uint64_t wp;
> +    BlockZoneType type;
> +    BlockZoneCondition cond;

BlockZoneState state;

> +} BlockZoneDescriptor;
> +
>  typedef struct BlockDriverInfo {
>  /* in bytes, 0 if irrelevant */
>  int cluster_size;



Re: [PATCH v12 7/7] docs/zoned-storage: add zoned device documentation

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> Add the documentation about the zoned device support to virtio-blk
> emulation.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Damien Le Moal 
> ---
>  docs/devel/zoned-storage.rst   | 43 ++
>  docs/system/qemu-block-drivers.rst.inc |  6 
>  2 files changed, 49 insertions(+)
>  create mode 100644 docs/devel/zoned-storage.rst
> 
> diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
> new file mode 100644
> index 00..cf169d029b
> --- /dev/null
> +++ b/docs/devel/zoned-storage.rst
> @@ -0,0 +1,43 @@
> +=
> +zoned-storage
> +=
> +
> +Zoned Block Devices (ZBDs) divide the LBA space into block regions called
> zones
> +that are larger than the LBA size. They can only allow sequential writes,
> which
> +can reduce write amplification in SSDs, and potentially lead to higher
> +throughput and increased capacity. More details about ZBDs can be found at:
> +
> +https://zonedstorage.io/docs/introduction/zoned-storage
> +
> +1. Block layer APIs for zoned storage
> +-
> +QEMU block layer has three zoned storage model:

replace it with

+QEMU block layer supports three zoned storage models:

? with this nit,

Reviewed-by: Dmitry Fomichev 

> +- BLK_Z_HM: The host-managed zoned model only allows sequential writes access
> +to zones. It supports ZBD-specific I/O commands that can be used by a host to
> +manage the zones of a device.
> +- BLK_Z_HA: The host-aware zoned model allows random write operations in
> +zones, making it backward compatible with regular block devices.
> +- BLK_Z_NONE: The non-zoned model has no zones support. It includes both
> +regular and drive-managed ZBD devices. ZBD-specific I/O commands are not
> +supported.
> +
> +The block device information resides inside BlockDriverState. QEMU uses
> +BlockLimits struct(BlockDriverState::bl) that is continuously accessed by the
> +block layer while processing I/O requests. A BlockBackend has a root pointer
> to
> +a BlockDriverState graph(for example, raw format on top of file-posix). The
> +zoned storage information can be propagated from the leaf BlockDriverState 
> all
> +the way up to the BlockBackend. If the zoned storage model in file-posix is
> +set to BLK_Z_HM, then block drivers will declare support for zoned host
> device.
> +
> +The block layer APIs support commands needed for zoned storage devices,
> +including report zones, four zone operations, and zone append.
> +
> +2. Emulating zoned storage controllers
> +--
> +When the BlockBackend's BlockLimits model reports a zoned storage device,
> users
> +like the virtio-blk emulation or the qemu-io-cmds.c utility can use block
> layer
> +APIs for zoned storage emulation or testing.
> +
> +For example, to test zone_report on a null_blk device using qemu-io is:
> +$ path/to/qemu-io --image-opts -n
> driver=zoned_host_device,filename=/dev/nullb0
> +-c "zrp offset nr_zones"
> diff --git a/docs/system/qemu-block-drivers.rst.inc b/docs/system/qemu-block-
> drivers.rst.inc
> index dfe5d2293d..0b97227fd9 100644
> --- a/docs/system/qemu-block-drivers.rst.inc
> +++ b/docs/system/qemu-block-drivers.rst.inc
> @@ -430,6 +430,12 @@ Hard disks
>    you may corrupt your host data (use the ``-snapshot`` command
>    line option or modify the device permissions accordingly).
>  
> +Zoned block devices
> +  Zoned block devices can be passed through to the guest if the emulated
> storage
> +  controller supports zoned storage. Use ``--blockdev zoned_host_device,
> +  node-name=drive0,filename=/dev/nullb0`` to pass through ``/dev/nullb0``
> +  as ``drive0``.
> +
>  Windows
>  ^^^
>  



Re: [RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
> Use scripts/update-linux-headers.sh to update virtio-blk headers
> from Dmitry's "virtio-blk:add support for zoned block devices"
> linux patch. There is a link for more information:
> https://github.com/dmitry-fomichev/virtblk-zbd
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Signed-off-by: Sam Li 

the duplicate sign-off is not needed. With this,

Reviewed-by: Dmitry Fomichev 

> ---
>  include/standard-headers/linux/virtio_blk.h | 109 
>  1 file changed, 109 insertions(+)
> 
> diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-
> headers/linux/virtio_blk.h
> index 2dcc90826a..490bd21c76 100644
> --- a/include/standard-headers/linux/virtio_blk.h
> +++ b/include/standard-headers/linux/virtio_blk.h
> @@ -40,6 +40,7 @@
>  #define VIRTIO_BLK_F_MQ12  /* support more than one vq */
>  #define VIRTIO_BLK_F_DISCARD   13  /* DISCARD is supported */
>  #define VIRTIO_BLK_F_WRITE_ZEROES  14  /* WRITE ZEROES is supported 
> */
> +#define VIRTIO_BLK_F_ZONED 17  /* Zoned block device */
>  
>  /* Legacy feature bits */
>  #ifndef VIRTIO_BLK_NO_LEGACY
> @@ -119,6 +120,20 @@ struct virtio_blk_config {
> uint8_t write_zeroes_may_unmap;
>  
> uint8_t unused1[3];
> +
> +   /* Secure erase fields that are defined in the virtio spec */
> +   uint8_t sec_erase[12];
> +
> +   /* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
> +   struct virtio_blk_zoned_characteristics {
> +   __virtio32 zone_sectors;
> +   __virtio32 max_open_zones;
> +   __virtio32 max_active_zones;
> +   __virtio32 max_append_sectors;
> +   __virtio32 write_granularity;
> +   uint8_t model;
> +   uint8_t unused2[3];
> +   } zoned;
>  } QEMU_PACKED;
>  
>  /*
> @@ -153,6 +168,27 @@ struct virtio_blk_config {
>  /* Write zeroes command */
>  #define VIRTIO_BLK_T_WRITE_ZEROES  13
>  
> +/* Zone append command */
> +#define VIRTIO_BLK_T_ZONE_APPEND    15
> +
> +/* Report zones command */
> +#define VIRTIO_BLK_T_ZONE_REPORT    16
> +
> +/* Open zone command */
> +#define VIRTIO_BLK_T_ZONE_OPEN  18
> +
> +/* Close zone command */
> +#define VIRTIO_BLK_T_ZONE_CLOSE 20
> +
> +/* Finish zone command */
> +#define VIRTIO_BLK_T_ZONE_FINISH    22
> +
> +/* Reset zone command */
> +#define VIRTIO_BLK_T_ZONE_RESET 24
> +
> +/* Reset All zones command */
> +#define VIRTIO_BLK_T_ZONE_RESET_ALL 26
> +
>  #ifndef VIRTIO_BLK_NO_LEGACY
>  /* Barrier before this op. */
>  #define VIRTIO_BLK_T_BARRIER   0x8000
> @@ -172,6 +208,72 @@ struct virtio_blk_outhdr {
> __virtio64 sector;
>  };
>  
> +/*
> + * Supported zoned device models.
> + */
> +
> +/* Regular block device */
> +#define VIRTIO_BLK_Z_NONE  0
> +/* Host-managed zoned device */
> +#define VIRTIO_BLK_Z_HM    1
> +/* Host-aware zoned device */
> +#define VIRTIO_BLK_Z_HA    2
> +
> +/*
> + * Zone descriptor. A part of VIRTIO_BLK_T_ZONE_REPORT command reply.
> + */
> +struct virtio_blk_zone_descriptor {
> +   /* Zone capacity */
> +   __virtio64 z_cap;
> +   /* The starting sector of the zone */
> +   __virtio64 z_start;
> +   /* Zone write pointer position in sectors */
> +   __virtio64 z_wp;
> +   /* Zone type */
> +   uint8_t z_type;
> +   /* Zone state */
> +   uint8_t z_state;
> +   uint8_t reserved[38];
> +};
> +
> +struct virtio_blk_zone_report {
> +   __virtio64 nr_zones;
> +   uint8_t reserved[56];
> +   struct virtio_blk_zone_descriptor zones[];
> +};
> +
> +/*
> + * Supported zone types.
> + */
> +
> +/* Conventional zone */
> +#define VIRTIO_BLK_ZT_CONV 1
> +/* Sequential Write Required zone */
> +#define VIRTIO_BLK_ZT_SWR  2
> +/* Sequential Write Preferred zone */
> +#define VIRTIO_BLK_ZT_SWP  3
> +
> +/*
> + * Zone states that are available for zones of all types.
> + */
> +
> +/* Not a write pointer (conventional zones only) */
> +#define VIRTIO_BLK_ZS_NOT_WP   0
> +/* Empty */
> +#define VIRTIO_BLK_ZS_EMPTY    1
> +/* Implicitly Open */
> +#define VIRTIO_BLK_ZS_IOPEN    2
> +/* Explicitly Open */
> +#define VIRTIO_BLK_ZS_EOPEN    3
> +/* Closed */
> +#define VIRTIO_BLK_ZS_CLOSED   4
> +/* Read-Only */
> +#define VIRTIO_BLK_ZS_RDONLY   13
> +/* Full */
> +#define VIRTIO_BLK_ZS_FULL 14
> +/* Offline */
> +#define VIRTIO_BLK

Re: [PATCH v12 2/7] file-posix: introduce helper functions for sysfs attributes

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> Use get_sysfs_str_val() to get the string value of device
> zoned model. Then get_sysfs_zoned_model() can convert it to
> BlockZoneModel type of QEMU.
> 
> Use get_sysfs_long_val() to get the long value of zoned device
> information.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Hannes Reinecke 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Damien Le Moal 

Reviewed-by: Dmitry Fomichev 

> ---
>  block/file-posix.c   | 124 ++-
>  include/block/block_int-common.h |   3 +
>  2 files changed, 91 insertions(+), 36 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 23acffb9a4..8cb07fdb8a 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1201,66 +1201,112 @@ static int hdev_get_max_hw_transfer(int fd, struct
> stat *st)
>  #endif
>  }
>  
> -static int hdev_get_max_segments(int fd, struct stat *st)
> -{
> +/*
> + * Get a sysfs attribute value as character string.
> + */
> +static int get_sysfs_str_val(struct stat *st, const char *attribute,
> + char **val) {
>  #ifdef CONFIG_LINUX
> -    char buf[32];
> -    const char *end;
> -    char *sysfspath = NULL;
> +    g_autofree char *sysfspath = NULL;
>  int ret;
> -    int sysfd = -1;
> -    long max_segments;
> +    size_t len;
>  
> -    if (S_ISCHR(st->st_mode)) {
> -    if (ioctl(fd, SG_GET_SG_TABLESIZE, ) == 0) {
> -    return ret;
> -    }
> +    if (!S_ISBLK(st->st_mode)) {
>  return -ENOTSUP;
>  }
>  
> -    if (!S_ISBLK(st->st_mode)) {
> -    return -ENOTSUP;
> +    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
> +    major(st->st_rdev), minor(st->st_rdev),
> +    attribute);
> +    ret = g_file_get_contents(sysfspath, val, , NULL);
> +    if (ret == -1) {
> +    return -ENOENT;
>  }
>  
> -    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> -    major(st->st_rdev), minor(st->st_rdev));
> -    sysfd = open(sysfspath, O_RDONLY);
> -    if (sysfd == -1) {
> -    ret = -errno;
> -    goto out;
> +    /* The file is ended with '\n' */
> +    char *p;
> +    p = *val;
> +    if (*(p + len - 1) == '\n') {
> +    *(p + len - 1) = '\0';
>  }
> -    do {
> -    ret = read(sysfd, buf, sizeof(buf) - 1);
> -    } while (ret == -1 && errno == EINTR);
> +    return ret;
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
> +static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
> +{
> +    g_autofree char *val = NULL;
> +    int ret;
> +
> +    ret = get_sysfs_str_val(st, "zoned", );
>  if (ret < 0) {
> -    ret = -errno;
> -    goto out;
> -    } else if (ret == 0) {
> -    ret = -EIO;
> -    goto out;
> +    return ret;
>  }
> -    buf[ret] = 0;
> -    /* The file is ended with '\n', pass 'end' to accept that. */
> -    ret = qemu_strtol(buf, , 10, _segments);
> -    if (ret == 0 && end && *end == '\n') {
> -    ret = max_segments;
> +
> +    if (strcmp(val, "host-managed") == 0) {
> +    *zoned = BLK_Z_HM;
> +    } else if (strcmp(val, "host-aware") == 0) {
> +    *zoned = BLK_Z_HA;
> +    } else if (strcmp(val, "none") == 0) {
> +    *zoned = BLK_Z_NONE;
> +    } else {
> +    return -ENOTSUP;
>  }
> +    return 0;
> +}
>  
> -out:
> -    if (sysfd != -1) {
> -    close(sysfd);
> +/*
> + * Get a sysfs attribute value as a long integer.
> + */
> +static long get_sysfs_long_val(struct stat *st, const char *attribute)
> +{
> +#ifdef CONFIG_LINUX
> +    g_autofree char *str = NULL;
> +    const char *end;
> +    long val;
> +    int ret;
> +
> +    ret = get_sysfs_str_val(st, attribute, );
> +    if (ret < 0) {
> +    return ret;
> +    }
> +
> +    /* The file is ended with '\n', pass 'end' to accept that. */
> +    ret = qemu_strtol(str, , 10, );
> +    if (ret == 0 && end && *end == '\0') {
> +    ret = val;
>  }
> -    g_free(sysfspath);
>  return ret;
>  #else
>  return -ENOTSUP;
>  #endif
>  }
>  
> +static int hdev_get_max_segments(int fd, struct stat *st)
> +{
> +#ifdef CONFIG_LINUX
> +    int ret;
> +
> +    if (S_ISCHR(st->st_mode)) {
> +    if (ioctl(fd, SG_GET_SG_TABLESIZE, ) == 0) {
> +    return ret;
> +    }
> +

RE: [PATCH v5 01/13] hw/block/nvme: fix zone management receive reporting too many zones

2021-03-14 Thread Dmitry Fomichev
LGTM,
Reviewed-by: Dmitry Fomichev 

> -Original Message-
> From: Klaus Jensen 
> Sent: Wednesday, March 10, 2021 4:54 AM
> To: qemu-devel@nongnu.org
> Cc: Stefan Hajnoczi ; Klaus Jensen
> ; Fam Zheng ; Max Reitz
> ; Kevin Wolf ; qemu-
> bl...@nongnu.org; Gollu Appalanaidu ; Keith
> Busch ; Klaus Jensen ;
> Dmitry Fomichev 
> Subject: [PATCH v5 01/13] hw/block/nvme: fix zone management receive
> reporting too many zones
> 
> From: Klaus Jensen 
> 
> nvme_zone_mgmt_recv uses nvme_ns_nlbas() to get the number of LBAs in
> the namespace and then calculates the number of zones to report by
> incrementing slba with ZSZE until exceeding the number of LBAs as
> returned by nvme_ns_nlbas().
> 
> This is bad because the namespace might be of such as size that some
> LBAs are valid, but are not part of any zone, causing zone management
> receive to report one additional (but non-existing) zone.
> 
> Fix this with a conventional loop on i < ns->num_zones instead.
> 
> Fixes: a479335bfaf3 ("hw/block/nvme: Support Zoned Namespace Command
> Set")
> Cc: Dmitry Fomichev 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index d439e44db839..c7b9a1663dd7 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -2619,12 +2619,13 @@ static uint16_t
> nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
>  uint32_t zone_idx, zra, zrasf, partial;
>  uint64_t max_zones, nr_zones = 0;
>  uint16_t status;
> -uint64_t slba, capacity = nvme_ns_nlbas(ns);
> +uint64_t slba;
>  NvmeZoneDescr *z;
>  NvmeZone *zone;
>  NvmeZoneReportHeader *header;
>  void *buf, *buf_p;
>  size_t zone_entry_sz;
> +int i;
> 
>  req->status = NVME_SUCCESS;
> 
> @@ -2666,7 +2667,7 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl
> *n, NvmeRequest *req)
>  buf = g_malloc0(data_size);
> 
>  zone = >zone_array[zone_idx];
> -for (; slba < capacity; slba += ns->zone_size) {
> +for (i = zone_idx; i < ns->num_zones; i++) {
>  if (partial && nr_zones >= max_zones) {
>  break;
>  }
> --
> 2.30.1




Re: [PATCH] hw/block/nvme: improve invalid zasl value reporting

2021-02-09 Thread Dmitry Fomichev
On Mon, 2021-02-08 at 09:25 +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> The Zone Append Size Limit (ZASL) must be at least 4096 bytes, so
> improve the user experience by adding an early parameter check in
> nvme_check_constraints.
> 
> When ZASL is still too small due to the host configuring the device for
> an even larger page size, convert the trace point in nvme_start_ctrl to
> an NVME_GUEST_ERR such that this is logged by QEMU instead of only
> traced.
> 
> Reported-by: "i...@dantalion.nl" 
> Cc: Dmitry Fomichev 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index c2f0c88fbf39..d96888cd2333 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -3983,8 +3983,10 @@ static int nvme_start_ctrl(NvmeCtrl *n)
>  n->zasl = n->params.mdts;
>  } else {
>  if (n->params.zasl_bs < n->page_size) {
> -trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs,
> -n->page_size);
> +NVME_GUEST_ERR(pci_nvme_err_startfail_zasl_too_small,
> +   "Zone Append Size Limit (ZASL) of %d bytes is too 
> "
> +   "small; must be at least %d bytes",
> +   n->params.zasl_bs, n->page_size);
>  return -1;
>  }
>  n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size);
> @@ -4503,6 +4505,12 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
> **errp)
>  error_setg(errp, "zone append size limit has to be a power of 
> 2");
>  return;
>  }
> +
> +if (n->params.zasl_bs < 4096) {
> +error_setg(errp, "zone append size limit must be at least "
> +   "4096 bytes");
> +return;
> +}
>  }
>  }

The guest error is less confusing than simply a trace. LGTM.
Reviewed-by: Dmitry Fomichev 

>  
> 
> 
> 



RE: [PATCH] hw/block/nvme: fix Close Zone

2021-02-09 Thread Dmitry Fomichev
> -Original Message-
> From: Klaus Jensen 
> Sent: Monday, February 8, 2021 12:54 PM
> To: Philippe Mathieu-Daudé 
> Cc: Keith Busch ; Dmitry Fomichev
> ; Kevin Wolf ; Max Reitz
> ; qemu-devel@nongnu.org; Niklas Cassel
> 
> Subject: Re: [PATCH] hw/block/nvme: fix Close Zone
> 
> On Feb  8 17:19, Philippe Mathieu-Daudé wrote:
> > Hi Keith,
> >
> > On 2/8/21 4:54 PM, Keith Busch wrote:
> > > On Mon, Feb 08, 2021 at 10:20:51AM +0100, Klaus Jensen wrote:
> > >> On Feb  8 10:03, Philippe Mathieu-Daudé wrote:
> > >>> Hi Dmitry, Klaus.
> > >>>
> > >>> On 2/8/21 1:32 AM, Dmitry Fomichev wrote:
> > >>>> Implicitly and Explicitly Open zones can be closed by Close Zone
> > >>>> management function. This got broken by a recent commit and now
> such
> > >>>> commands fail with Invalid Zone State Transition status.
> > >>>>
> > >>>> Modify nvm_zrm_close() function to make Close Zone work
> correctly.
> > >>>>
> > >>>> Signed-off-by: Dmitry Fomichev 
> > >>>> Fixes: 053b5a302c3("hw/block/nvme: refactor zone resource
> management")
> > >>>
> > >>> '053b5a302c3': unknown revision or path not in the working tree.
> > >>>
> > >>> If you point at an unmerged commit, why not fix it directly
> > >>> before merging?
> > >>>
> > >>
> > >> Dmitry, you OK with me squashing this fix and appending
> > >>
> > >> [dmitry: fix broken Close Zone]
> > >> Signed-off-by: Dmitry Fomichev 
> > >>
> > >> to the commit message?
> > >

Sure. I should perhaps have added Based-on tag to the commit.

> > > IMO, we should avoid the habit of rebasing and force pushes on staging
> > > trees once they're public.
> >
> > Well I had not information this patch was targeting another tree.
> >
> > If you don't want to send regular pull request, it would be useful
> > to ask the NVMe contributors to provide an information on which
> > tree their patch is based.
> >
> 
> I'm just behind on sending a that pull request. I'll do that :)


[PATCH] hw/block/nvme: fix Close Zone

2021-02-07 Thread Dmitry Fomichev
Implicitly and Explicitly Open zones can be closed by Close Zone
management function. This got broken by a recent commit and now such
commands fail with Invalid Zone State Transition status.

Modify the nvme_zrm_close() function to make Close Zone work correctly.

Signed-off-by: Dmitry Fomichev 
Fixes: 053b5a302c3 ("hw/block/nvme: refactor zone resource management")
---
 hw/block/nvme.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 6b84e34843..c2f0c88fbf 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1308,14 +1308,13 @@ static uint16_t nvme_zrm_finish(NvmeNamespace *ns, 
NvmeZone *zone)
 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
 {
 switch (nvme_get_zone_state(zone)) {
-case NVME_ZONE_STATE_CLOSED:
-return NVME_SUCCESS;
-
 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
 nvme_aor_dec_open(ns);
 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
 /* fall through */
+case NVME_ZONE_STATE_CLOSED:
+return NVME_SUCCESS;
 
 default:
 return NVME_ZONE_INVAL_TRANSITION;
-- 
2.28.0




Re: [PATCH v2] hw/block/nvme: add missing mor/mar constraint checks

2021-02-07 Thread Dmitry Fomichev
On Tue, 2021-01-26 at 13:15 +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Firstly, if zoned.max_active is non-zero, zoned.max_open must be less
> than or equal to zoned.max_active.
> 
> Secondly, if only zones.max_active is set, we have to explicitly set
> zones.max_open or we end up with an invalid MAR/MOR configuration. This
> is an artifact of the parameters not being zeroes-based like in the
> spec.
> 
> Cc: Dmitry Fomichev 
> Reported-by: Gollu Appalanaidu 
> Signed-off-by: Klaus Jensen 

Reviewed-by: Dmitry Fomichev 

> ---
> 
> v2:
> 
>   * Jumped the gun on removing the check on zoned.max_open. It should
> still be done since the device might only have a constraint on open
> zones, not active.
>   * Instead, added an explicit set of zoned.max_open if only
> zoned.max_active is specifed.
> 
>  hw/block/nvme-ns.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index 62b25cf69bfa..df514287b58f 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -153,6 +153,18 @@ static int 
> nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
>  return -1;
>  }
>  
> 
> 
> 
> +if (ns->params.max_active_zones) {
> +if (ns->params.max_open_zones > ns->params.max_active_zones) {
> +error_setg(errp, "max_open_zones (%u) exceeds max_active_zones 
> (%u)",
> +   ns->params.max_open_zones, 
> ns->params.max_active_zones);
> +return -1;
> +}
> +
> +if (!ns->params.max_open_zones) {
> +ns->params.max_open_zones = ns->params.max_active_zones;
> +}
> +}
> +
>  if (ns->params.zd_extension_size) {
>  if (ns->params.zd_extension_size & 0x3f) {
>  error_setg(errp,



RE: [PATCH 0/2] hw/block/nvme: zoned fixes

2021-01-27 Thread Dmitry Fomichev
> -Original Message-
> From: Keith Busch 
> Sent: Wednesday, January 27, 2021 12:42 PM
> To: Klaus Jensen 
> Cc: qemu-devel@nongnu.org; Kevin Wolf ; Max Reitz
> ; qemu-bl...@nongnu.org; Dmitry Fomichev
> ; Klaus Jensen 
> Subject: Re: [PATCH 0/2] hw/block/nvme: zoned fixes
> 
> On Tue, Jan 19, 2021 at 02:54:58PM +0100, Klaus Jensen wrote:
> > From: Klaus Jensen 
> >
> > Patch [1/2] fixes the zone append bug reported by Niklas. [2/2]
> > refactors the zone write check function to return status codes in a
> > different order if there are multiple zone write violations that apply.
> 
> Looks good to me.
> 
> Reviewed-by: Keith Busch 

Besides aesthetics, I don't have any complaints about this series :)

Tested-by: Dmitry Fomichev 
Reviewed-by: Dmitry Fomichev 



[PATCH 3/3] hw/block/nvme: Add trace events for zone boundary violations

2021-01-25 Thread Dmitry Fomichev
Add three new trace events to be raised in case of zone boundary
violations during Read, Write(Zeroes) and Zone Append.

Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme.c   | 8 ++--
 hw/block/trace-events | 3 +++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b712634c27..4059bd72e6 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1148,6 +1148,8 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
 status = NVME_INVALID_FIELD;
 } else if (unlikely(zone->w_ptr + nlb > bndry)) {
+trace_pci_nvme_err_append_across_boundary(slba, nlb,
+  zone->w_ptr, bndry);
 status = NVME_ZONE_BOUNDARY_ERROR;
 } else if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
 trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
@@ -1159,6 +1161,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
zone->w_ptr);
 status = NVME_ZONE_INVALID_WRITE;
 } else if (unlikely(slba + nlb > bndry)) {
+trace_pci_nvme_err_write_across_boundary(slba, nlb, bndry);
 status = NVME_ZONE_BOUNDARY_ERROR;
 }
 }
@@ -1200,9 +1203,10 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, 
uint64_t slba,
 
 status = nvme_check_zone_state_for_read(zone);
 if (status != NVME_SUCCESS) {
-;
+trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
 } else if (unlikely(end > bndry)) {
 if (!ns->params.cross_zone_read) {
+trace_pci_nvme_err_read_across_boundary(slba, nlb, bndry);
 status = NVME_ZONE_BOUNDARY_ERROR;
 } else {
 /*
@@ -1213,6 +1217,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, 
uint64_t slba,
 zone++;
 status = nvme_check_zone_state_for_read(zone);
 if (status != NVME_SUCCESS) {
+trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
 break;
 }
 } while (end > nvme_zone_rd_boundary(ns, zone));
@@ -1624,7 +1629,6 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
 if (ns->params.zoned) {
 status = nvme_check_zone_read(ns, slba, nlb);
 if (status != NVME_SUCCESS) {
-trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
 goto invalid;
 }
 }
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 955ee4860f..18bbe560fc 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -127,6 +127,9 @@ pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t 
slba, uint64_t zslba) "
 pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, 
uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32""
 pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) 
"writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64""
 pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at 
slba=%"PRIu64", but zone=%"PRIu64""
+pci_nvme_err_append_across_boundary(uint64_t slba, uint32_t nlb, uint64_t wp, 
uint64_t bndry) "slba=%"PRIu64", nlb=%"PRIu32", w_ptr=%"PRIu64", 
bndry=%"PRIu64""
+pci_nvme_err_write_across_boundary(uint64_t slba, uint32_t nlb, uint64_t 
bndry) "slba=%"PRIu64", nlb=%"PRIu32", bndry=%"PRIu64""
+pci_nvme_err_read_across_boundary(uint64_t slba, uint32_t nlb, uint64_t bndry) 
"slba=%"PRIu64", nlb=%"PRIu32", bndry=%"PRIu64""
 pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint32_t status) 
"slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
 pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint32_t status) 
"slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
 pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) 
"slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8""
-- 
2.28.0




[PATCH 2/3] hw/block/nvme: Check zone state before checking boundaries

2021-01-25 Thread Dmitry Fomichev
The code in nvme_check_zone_write() first checks for zone boundary
condition violation and only after that it proceeds to verify that the
zone state is suitable the write to happen. This is not consistent with
nvme_check_zone_read() flow - in that function, zone state is checked
before making any boundary checks. Besides, checking that ZSLBA + NLB
does not cross the write boundary is now redundant for Zone Append and
only needs to be done for writes.

Move the check in the code to be performed for Write and Write Zeroes
commands only and to occur after zone state checks.

Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 67538010ef..b712634c27 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1138,13 +1138,8 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 uint64_t bndry = nvme_zone_wr_boundary(zone);
 uint16_t status;
 
-if (unlikely(slba + nlb > bndry)) {
-status = NVME_ZONE_BOUNDARY_ERROR;
-} else {
-status = nvme_check_zone_state_for_write(zone);
-}
-
-if (status != NVME_SUCCESS) {
+status = nvme_check_zone_state_for_write(zone);
+if (status) {
 trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
 } else {
 assert(nvme_wp_is_valid(zone));
@@ -1158,10 +1153,14 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
 status = NVME_INVALID_FIELD;
 }
-} else if (unlikely(slba != zone->w_ptr)) {
-trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
-   zone->w_ptr);
-status = NVME_ZONE_INVALID_WRITE;
+} else {
+if (unlikely(slba != zone->w_ptr)) {
+trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
+   zone->w_ptr);
+status = NVME_ZONE_INVALID_WRITE;
+} else if (unlikely(slba + nlb > bndry)) {
+status = NVME_ZONE_BOUNDARY_ERROR;
+}
 }
 }
 
-- 
2.28.0




[PATCH 0/3] Fix zone write validation

2021-01-25 Thread Dmitry Fomichev
These patches solve a few problems that exist in zoned Write
and Zone Append validation code.

Dmitry Fomichev (3):
  hw/block/nvme: Check for zone boundary during append
  hw/block/nvme: Check zone state before checking boundaries
  hw/block/nvme: Add trace events for zone boundary violations

 hw/block/nvme.c   | 35 ---
 hw/block/trace-events |  3 +++
 2 files changed, 23 insertions(+), 15 deletions(-)

-- 
2.28.0




[PATCH 1/3] hw/block/nvme: Check for zone boundary during append

2021-01-25 Thread Dmitry Fomichev
It is observed that with the existing code it is possible to keep
appending to a zone indefinitely. To fix, add the missing check to
verify that the zone append is not going to write beyond zone capacity.

Reported-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f64676a930..67538010ef 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1135,9 +1135,10 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
   NvmeZone *zone, uint64_t slba,
   uint32_t nlb, bool append)
 {
+uint64_t bndry = nvme_zone_wr_boundary(zone);
 uint16_t status;
 
-if (unlikely((slba + nlb) > nvme_zone_wr_boundary(zone))) {
+if (unlikely(slba + nlb > bndry)) {
 status = NVME_ZONE_BOUNDARY_ERROR;
 } else {
 status = nvme_check_zone_state_for_write(zone);
@@ -1151,8 +1152,9 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 if (unlikely(slba != zone->d.zslba)) {
 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
 status = NVME_INVALID_FIELD;
-}
-if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
+} else if (unlikely(zone->w_ptr + nlb > bndry)) {
+status = NVME_ZONE_BOUNDARY_ERROR;
+} else if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
 trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
 status = NVME_INVALID_FIELD;
 }
-- 
2.28.0




Re: [PATCH 3/3] hw/block/nvme: align with existing style

2021-01-25 Thread Dmitry Fomichev
On Mon, 2021-01-25 at 09:22 +0100, Klaus Jensen wrote:
> From: Gollu Appalanaidu 
> 
> Change status checks to align with the existing style and remove the
> explicit check against NVME_SUCCESS.
> 
> Cc: Dmitry Fomichev 
> Signed-off-by: Gollu Appalanaidu 
> Reviewed-by: Klaus Jensen 

Looks good.
Reviewed-by: Dmitry Fomichev 

> ---
>  hw/block/nvme.c | 20 ++--
>  1 file changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 1be5b54e0fed..98d84fe26644 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1198,7 +1198,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
> NvmeNamespace *ns,
>  status = nvme_check_zone_state_for_write(zone);
>  }
>  
> 
> 
> 
> -if (status != NVME_SUCCESS) {
> +if (status) {
>  trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
>  } else {
>  assert(nvme_wp_is_valid(zone));
> @@ -1253,7 +1253,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, 
> uint64_t slba,
>  uint16_t status;
>  
> 
> 
> 
>  status = nvme_check_zone_state_for_read(zone);
> -if (status != NVME_SUCCESS) {
> +if (status) {
>  ;
>  } else if (unlikely(end > bndry)) {
>  if (!ns->params.cross_zone_read) {
> @@ -1266,7 +1266,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, 
> uint64_t slba,
>  do {
>  zone++;
>  status = nvme_check_zone_state_for_read(zone);
> -if (status != NVME_SUCCESS) {
> +if (status) {
>  break;
>  }
>  } while (end > nvme_zone_rd_boundary(ns, zone));
> @@ -1677,7 +1677,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
>  
> 
> 
> 
>  if (ns->params.zoned) {
>  status = nvme_check_zone_read(ns, slba, nlb);
> -if (status != NVME_SUCCESS) {
> +if (status) {
>  trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
>  goto invalid;
>  }
> @@ -1748,12 +1748,12 @@ static uint16_t nvme_do_write(NvmeCtrl *n, 
> NvmeRequest *req, bool append,
>  zone = nvme_get_zone_by_slba(ns, slba);
>  
> 
> 
> 
>  status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
> -if (status != NVME_SUCCESS) {
> +if (status) {
>  goto invalid;
>  }
>  
> 
> 
> 
>  status = nvme_auto_open_zone(ns, zone);
> -if (status != NVME_SUCCESS) {
> +if (status) {
>  goto invalid;
>  }
>  
> 
> 
> 
> @@ -1852,14 +1852,14 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, 
> NvmeZone *zone,
>  switch (state) {
>  case NVME_ZONE_STATE_EMPTY:
>  status = nvme_aor_check(ns, 1, 0);
> -if (status != NVME_SUCCESS) {
> +if (status) {
>  return status;
>  }
>  nvme_aor_inc_active(ns);
>  /* fall through */
>  case NVME_ZONE_STATE_CLOSED:
>  status = nvme_aor_check(ns, 0, 1);
> -if (status != NVME_SUCCESS) {
> +if (status) {
>  if (state == NVME_ZONE_STATE_EMPTY) {
>  nvme_aor_dec_active(ns);
>  }
> @@ -1972,7 +1972,7 @@ static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, 
> NvmeZone *zone)
>  
> 
> 
> 
>  if (state == NVME_ZONE_STATE_EMPTY) {
>  status = nvme_aor_check(ns, 1, 0);
> -if (status != NVME_SUCCESS) {
> +if (status) {
>  return status;
>  }
>  nvme_aor_inc_active(ns);
> @@ -3301,7 +3301,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, 
> NvmeRequest *req)
>  
> 
> 
> 
>  ret = nvme_dma(n, (uint8_t *), sizeof(timestamp),
> DMA_DIRECTION_TO_DEVICE, req);
> -if (ret != NVME_SUCCESS) {
> +if (ret) {
>  return ret;
>  }
>  
> 
> 
> 



Re: [PATCH 2/2] hw/block/nvme: refactor the logic for zone write checks

2021-01-25 Thread Dmitry Fomichev
On Tue, 2021-01-19 at 14:55 +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Refactor the zone write check logic such that the most "meaningful"
> error is returned first. That is, first, if the zone is not writable,
> return an appropriate status code for that. Then, make sure we are
> actually writing at the write pointer and finally check that we do not
> cross the zone write boundary. This aligns with the "priority" of status
> codes for zone read checks.

Right, all boundary checks need to be performed after validating the zone
state and performing the WP checks. 

> 
> Also add a couple of additional descriptive trace events and remove an
> always true assert.

Adding the new trace events looks like a great idea. I think that assert
is useful, but I have no qualms about removing it.

Overall, this looks good to me, except the part about open coding
ZA validation :)

Best regards,
Dmitry

> 
> Cc: Dmitry Fomichev 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 47 +++
>  hw/block/trace-events |  5 +
>  2 files changed, 26 insertions(+), 26 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index f05dea657b01..193a27259dda 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1106,56 +1106,51 @@ static inline NvmeZone 
> *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
>  
> 
> 
> 
>  static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
>  {
> -uint16_t status;
> +uint64_t zslba = zone->d.zslba;
>  
> 
> 
> 
>  switch (nvme_get_zone_state(zone)) {
>  case NVME_ZONE_STATE_EMPTY:
>  case NVME_ZONE_STATE_IMPLICITLY_OPEN:
>  case NVME_ZONE_STATE_EXPLICITLY_OPEN:
>  case NVME_ZONE_STATE_CLOSED:
> -status = NVME_SUCCESS;
> -break;
> +return NVME_SUCCESS;
>  case NVME_ZONE_STATE_FULL:
> -status = NVME_ZONE_FULL;
> -break;
> +trace_pci_nvme_err_zone_is_full(zslba);
> +return NVME_ZONE_FULL;
>  case NVME_ZONE_STATE_OFFLINE:
> -status = NVME_ZONE_OFFLINE;
> -break;
> +trace_pci_nvme_err_zone_is_offline(zslba);
> +return NVME_ZONE_OFFLINE;
>  case NVME_ZONE_STATE_READ_ONLY:
> -status = NVME_ZONE_READ_ONLY;
> -break;
> +trace_pci_nvme_err_zone_is_read_only(zslba);
> +return NVME_ZONE_READ_ONLY;
>  default:
>  assert(false);
>  }
> -
> -return status;
>  }
>  
>  static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
>    NvmeZone *zone, uint64_t slba,
>    uint32_t nlb)
>  {
> +uint64_t zcap = nvme_zone_wr_boundary(zone);
>  uint16_t status;
>  
> 
> 
> 
> -if (unlikely((slba + nlb) > nvme_zone_wr_boundary(zone))) {
> -status = NVME_ZONE_BOUNDARY_ERROR;
> -} else {
> -status = nvme_check_zone_state_for_write(zone);
> +status = nvme_check_zone_state_for_write(zone);
> +if (status) {
> +return status;
>  }
>  
> 
> 
> 
> -if (status != NVME_SUCCESS) {
> -trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
> -} else {
> -assert(nvme_wp_is_valid(zone));
> -
> -if (unlikely(slba != zone->w_ptr)) {
> -trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
> -   zone->w_ptr);
> -status = NVME_ZONE_INVALID_WRITE;
> -}
> +if (unlikely(slba != zone->w_ptr)) {
> +trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
> +return NVME_ZONE_INVALID_WRITE;
>  }
>  
> 
> 
> 
> -return status;
> +if (unlikely((slba + nlb) > zcap)) {
> +trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
> +return NVME_ZONE_BOUNDARY_ERROR;
> +}
> +
> +return NVME_SUCCESS;
>  }
>  
> 
>  static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 78d76b0a71c1..c80b02b85ac9 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -127,6 +127,11 @@ pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t 
> slba, uint64_t zslba) "
>  pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, 
> uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32""
>  pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) 
> "writing at slba=%"PR

Re: [PATCH 1/2] hw/block/nvme: fix zone boundary check for append

2021-01-25 Thread Dmitry Fomichev
On Tue, 2021-01-19 at 14:54 +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> When a zone append is processed the controller checks that validity of
> the write before assigning the LBA to the append command. This causes
> the boundary check to be wrong.
> 
> Fix this by checking the write *after* assigning the LBA. Remove the
> append special case from the nvme_check_zone_write and open code it in
> nvme_do_write, assigning the slba when basic sanity checks have been
> performed. Then check the validity of the resulting write like any other
> write command.
> 

Klaus,

I tested this series and it works fine. I however think that the problem that
Niklas has found can be fixed in a much simpler way by applying the
following to the existing code -

--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1152,6 +1152,9 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
 status = NVME_INVALID_FIELD;
 }
+if (zone->w_ptr + nlb > nvme_zone_wr_boundary(zone)) {
+status = NVME_ZONE_BOUNDARY_ERROR;
+}
 if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
 trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
 status = NVME_INVALID_FIELD;

I am going to send a few patches that take this approach, please take a look. 
In my
testing, it works just as well :)

> 
> In the process, also fix a missing endianness conversion for the zone
> append ALBA.

Great catch! This could be placed in a separate patch though...
A few more comments below.


Reported-by: Niklas Cassel 
Cc: Dmitry Fomichev 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 46 --
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 309c26db8ff7..f05dea657b01 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1133,7 +1133,7 @@ static uint16_t nvme_check_zone_state_for_write(NvmeZone 
*zone)
 
 static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
   NvmeZone *zone, uint64_t slba,
-  uint32_t nlb, bool append)
+  uint32_t nlb)
 {
 uint16_t status;
 
@@ -1147,16 +1147,8 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
 } else {
 assert(nvme_wp_is_valid(zone));
-if (append) {
-if (unlikely(slba != zone->d.zslba)) {
-trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
-status = NVME_INVALID_FIELD;
-}
-if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
-trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
-status = NVME_INVALID_FIELD;
-}
-} else if (unlikely(slba != zone->w_ptr)) {
+
+if (unlikely(slba != zone->w_ptr)) {
 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
zone->w_ptr);
 status = NVME_ZONE_INVALID_WRITE;
@@ -1294,10 +1286,9 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, 
NvmeRequest *req,
 }
 }
 
-static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
- uint32_t nlb)
+static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
+ uint32_t nlb)
 {
-uint64_t result = zone->w_ptr;
 uint8_t zs;
 
 zone->w_ptr += nlb;
@@ -1313,8 +1304,6 @@ static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, 
NvmeZone *zone,
 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
 }
 }
-
-return result;
 }

 static inline bool nvme_is_write(NvmeRequest *req)
@@ -1692,7 +1681,24 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest 
*req, bool append,
 if (ns->params.zoned) {
 zone = nvme_get_zone_by_slba(ns, slba);

-status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
+if (append) {

This is what I see as a drawback of this approach.
You have to move the ZA checks that were previously nicely tucked in
nvme_check_zone_write() to the spot below and now this validation is done
in two different places for appends and regular writes. This can be avoided.
 
+if (unlikely(slba != zone->d.zslba)) {
+trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
+status = NVME_INVALID_FIELD;
+goto invalid;
+}
+
+if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
+  

Re: [PATCH 0/2] hw/block/nvme: zoned fixes

2021-01-25 Thread Dmitry Fomichev
On Mon, 2021-01-25 at 08:25 +0100, Klaus Jensen wrote:
> On Jan 19 14:54, Klaus Jensen wrote:
> > From: Klaus Jensen 
> > 
> > Patch [1/2] fixes the zone append bug reported by Niklas. [2/2]
> > refactors the zone write check function to return status codes in a
> > different order if there are multiple zone write violations that apply.
> > 
> > Klaus Jensen (2):
> >   hw/block/nvme: fix zone boundary check for append
> >   hw/block/nvme: refactor the logic for zone write checks
> > 
> >  hw/block/nvme.c   | 89 +--
> >  hw/block/trace-events |  5 +++
> >  2 files changed, 48 insertions(+), 46 deletions(-)
> > 
> 
> Pinging for a review. Dmitry maybe?

I am testing it now, will post my review shortly.

DF


[PATCH] hw/block/nvme: Correct error status for unaligned ZA

2021-01-17 Thread Dmitry Fomichev
TP 4053 says (in section 2.3.1.1) -
... if a Zone Append command specifies a ZSLBA that is not the lowest
logical block address in that zone, then the controller shall abort
that command with a status code of Invalid Field In Command.

In the code, Zone Invalid Write is returned instead, fix this.

Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a8dc13553d..f64676a930 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1150,7 +1150,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 if (append) {
 if (unlikely(slba != zone->d.zslba)) {
 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
-status = NVME_ZONE_INVALID_WRITE;
+status = NVME_INVALID_FIELD;
 }
 if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
 trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
-- 
2.28.0




Re: [PATCH 0/6] hw/block/nvme: zoned misc fixes

2021-01-17 Thread Dmitry Fomichev
On Mon, 2021-01-11 at 13:32 +0100, Klaus Jensen wrote:
> From: Klaus Jensen 

Klaus,

This series looks good to me. I've made a comment in "zero out zones on reset"
patch, but it is only cosmetic in nature. I am going to send a patch
with another small fix in ZNS code.

Best regards,
Dmitry

Tested-by: Dmitry Fomichev 
Reviewed-by: Dmitry Fomichev 

> 
> These are some follow-up patches to the just merged zoned series.
> 
> The biggest addition here is asynchronous zeroing of zones on reset.
> 
> Klaus Jensen (6):
>   hw/block/nvme: fix shutdown/reset logic
>   hw/block/nvme: merge implicitly/explicitly opened processing masks
>   hw/block/nvme: enum style fix
>   hw/block/nvme: zero out zones on reset
>   hw/block/nvme: add missing string representations for commands
>   hw/block/nvme: remove unnecessary check for append
> 
>  hw/block/nvme-ns.h|   4 +-
>  hw/block/nvme.h   |   4 +
>  include/block/nvme.h  |   4 +-
>  hw/block/nvme.c   | 200 +++---
>  hw/block/trace-events |   1 +
>  5 files changed, 140 insertions(+), 73 deletions(-)
> 
> -- 
> 2.30.0
> 



Re: [PATCH 4/6] hw/block/nvme: zero out zones on reset

2021-01-17 Thread Dmitry Fomichev
On Mon, 2021-01-11 at 13:32 +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> The zoned command set specification states that "All logical blocks in a
> zone *shall* be marked as deallocated when [the zone is reset]". Since
> the device guarantees 0x00 to be read from deallocated blocks we have to
> issue a pwrite_zeroes since we cannot be sure that a discard will do
> anything. But typically, this will be achieved with an efficient
> unmap/discard operation.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 150 +++---
>  hw/block/trace-events |   1 +
>  2 files changed, 113 insertions(+), 38 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 7c2ec17ad7d9..b3658595fe1b 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1371,6 +1371,53 @@ static void nvme_aio_discard_cb(void *opaque, int ret)
>  nvme_enqueue_req_completion(nvme_cq(req), req);
>  }
>  
> 
> 
> 
> +struct nvme_zone_reset_ctx {
> +NvmeRequest *req;
> +NvmeZone*zone;
> +};
> +
> +static void nvme_aio_zone_reset_cb(void *opaque, int ret)
> +{
> +struct nvme_zone_reset_ctx *ctx = opaque;
> +NvmeRequest *req = ctx->req;
> +NvmeNamespace *ns = req->ns;
> +NvmeZone *zone = ctx->zone;
> +uintptr_t *resets = (uintptr_t *)>opaque;
> +
> +g_free(ctx);
> +
> +trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
> +
> +if (!ret) {
> +switch (nvme_get_zone_state(zone)) {
> +case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> +case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> +nvme_aor_dec_open(ns);
> +/* fall through */
> +case NVME_ZONE_STATE_CLOSED:
> +nvme_aor_dec_active(ns);
> +/* fall through */
> +case NVME_ZONE_STATE_FULL:
> +zone->w_ptr = zone->d.zslba;
> +zone->d.wp = zone->w_ptr;
> +nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
> +/* fall through */
> +default:
> +break;
> +}
> +} else {
> +nvme_aio_err(req, ret);
> +}
> +
> +(*resets)--;
> +
> +if (*resets) {
> +return;
> +}
> +
> +nvme_enqueue_req_completion(nvme_cq(req), req);

A tiny nit - the above could be simplified to

   if (!*resets) {
   nvme_enqueue_req_completion(nvme_cq(req), req);
   }

Overall, this patch looks good to me.

> +}
> +
>  struct nvme_compare_ctx {
>  QEMUIOVector iov;
>  uint8_t *bounce;
> @@ -1735,7 +1782,8 @@ static uint16_t 
> nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
>  return NVME_SUCCESS;
>  }
>  
> 
> 
> 
> -typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState);
> +typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
> + NvmeRequest *);
>  
> 
> 
> 
>  enum NvmeZoneProcessingMask {
>  NVME_PROC_CURRENT_ZONE= 0,
> @@ -1746,7 +1794,7 @@ enum NvmeZoneProcessingMask {
>  };
>  
> 
> 
> 
>  static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
> -   NvmeZoneState state)
> +   NvmeZoneState state, NvmeRequest *req)
>  {
>  uint16_t status;
>  
> 
> 
> 
> @@ -1779,7 +1827,7 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, 
> NvmeZone *zone,
>  }
>  
> 
> 
> 
>  static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
> -NvmeZoneState state)
> +NvmeZoneState state, NvmeRequest *req)
>  {
>  switch (state) {
>  case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> @@ -1795,7 +1843,7 @@ static uint16_t nvme_close_zone(NvmeNamespace *ns, 
> NvmeZone *zone,
>  }
>  
> 
> 
> 
>  static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
> - NvmeZoneState state)
> + NvmeZoneState state, NvmeRequest *req)
>  {
>  switch (state) {
>  case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> @@ -1818,30 +1866,42 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, 
> NvmeZone *zone,
>  }
>  
> 
> 
> 
>  static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
> -NvmeZoneState state)
> +NvmeZoneState state, NvmeRequest *req)
>  {
> +uintptr_t *resets = (uintptr_t *)>opaque;
> +struct nvme_zone_reset_ctx *ctx;
> +
>  switch (state) {
> -case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> -case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> -nvme_aor_dec_open(ns);
> -/* fall through */
> -case NVME_ZONE_STATE_CLOSED:
> -nvme_aor_dec_active(ns);
> -/* fall through */
> -case NVME_ZONE_STATE_FULL:
> -zone->w_ptr = zone->d.zslba;
> -zone->d.wp = zone->w_ptr;
> -nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
> -/* fall through */
>  case NVME_ZONE_STATE_EMPTY:

RE: [PATCH v11 00/13] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set

2020-12-10 Thread Dmitry Fomichev
> -Original Message-
> From: Klaus Jensen 
> Sent: Wednesday, December 9, 2020 4:58 AM
> To: Dmitry Fomichev 
> Cc: Keith Busch ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Max Reitz ;
> Maxim Levitsky ; Fam Zheng ;
> Niklas Cassel ; Damien Le Moal
> ; qemu-bl...@nongnu.org; qemu-
> de...@nongnu.org; Alistair Francis ; Matias
> Bjorling 
> Subject: Re: [PATCH v11 00/13] hw/block/nvme: Support Namespace Types
> and Zoned Namespace Command Set
> 
> Hi Dmitry,
> 
> By and large, this looks OK to me. There are still some issues here and
> there, and some comments of mine that you did not address, but I will
> follow up with patches to fix that. Let's get this merged.
> 
> It looks like the nvme-next you rebased on is slightly old and missing
> two commits:
> 
>   "hw/block/nvme: remove superfluous NvmeCtrl parameter" and
>   "hw/block/nvme: pull aio error handling"
> 
> It caused a couple of conflicts, but nothing that I couldn't fix up.
> 
> Since I didn't manage to convince anyone about the zsze and zcap
> parameters being in terms of LBAs, I'll revert that to be
> 'zoned.zone_size' and 'zoned.zone_capacity'.
> 
> Finally, would you accept that we skip "hw/block/nvme: Add injection of
> Offline/Read-Only zones" for now? I'd like to discuss it a bit since I
> think the random injects feels a bit ad-hoc. Back when I did OCSSD
> emulation with Hans, we did something like this for setting up state
> through a descriptor text file - I think we should explore something
> like that before we lock down the two parameters. I'll amend the final
> documentation commit to not include those parameters.
> 
> Sounds good?

Klaus,

Sounds great! Sure, we can leave out the injection patch. It was made
to increase our internal test coverage, but it is not ideal. Since the zones
are injected randomly, there is no consistency between test runs and
it is impossible to reliably create many specific test cases (e.g. the first or
the last zone is offline). The descriptor input file seems like a much more
flexible and capable approach. If you have something in works, I'll be
happy to discuss or review.

Thank you for your very thorough reviews!

Cheers,
Dmitry

> 
> Otherwise, I think this is mergeable to nvme-next. So, for the series
> (excluding "hw/block/nvme: Add injection of Offline/Read-Only zones"):
> 
> Reviewed-by: Klaus Jensen 
> 
> On Dec  9 05:03, Dmitry Fomichev wrote:
> > v10 -> v11:
> >
> >  - Address review comments by Klaus.
> >
> >  - Add a patch to separate the handling of controller reset
> >and subsystem shutdown. Place the patch at the beginning
> >of the series so it can be picked up separately.
> >
> >  - Rebase on the current nvme-next branch.
> >
> > v9 -> v10:
> >
> >  - Correctly check for MDTS in Zone Management Receive handler.
> >
> >  - Change Klaus' "Reviewed-by" email in UUID patch.
> >
> > v8 -> v9:
> >
> >  - Move the modifications to "include/block/nvme.h" made to
> >introduce ZNS-related definitions to a separate patch.
> >
> >  - Add a new struct, NvmeZonedResult, along the same lines as the
> >existing NvmeAerResult, to carry Zone Append LBA returned to
> >the host. Now, there is no need to modify NvmeCqe struct except
> >renaming DW1 field from "rsvd" to "dw1".
> >
> >  - Add check for MDTS in Zone Management Receive handler.
> >
> >  - Remove checks for ns->attached since the value of this flag
> >is always true for now.
> >
> >  - Rebase to the current quemu-nvme/nvme-next branch.
> >
> > v7 -> v8:
> >
> >  - Move refactoring commits to the front of the series.
> >
> >  - Remove "attached" and "fill_pattern" device properties.
> >
> >  - Only close open zones upon subsystem shutdown, not when CC.EN flag
> >is set to 0. Avoid looping through all zones by iterating through
> >lists of open and closed zones.
> >
> >  - Improve bulk processing of zones aka zoned operations with "all"
> >flag set. Avoid looping through the entire zone array for all zone
> >operations except Offline Zone.
> >
> >  - Prefix ZNS-related property names with "zoned.". The "zoned" Boolean
> >property is retained to turn on zoned command set as it is much more
> >intuitive and user-friendly compared to setting a magic number value
> >to csi property.
> >
> >  - Address review comments.
> >
> >  - Remove unused trace events.
&g

[PATCH v11 01/13] hw/block/nvme: Process controller reset and shutdown differently

2020-12-08 Thread Dmitry Fomichev
Controller reset and subsystem shutdown are handled very much the same
in the current code, but some of the steps should be different in these
two cases.

Introduce two new functions, nvme_reset_ctrl() and nvme_shutdown_ctrl(),
to separate some portions of the code from nvme_clear_ctrl(). The steps
that are made different between reset and shutdown are that BAR.CC is not
reset to zero upon the shutdown and namespace data is flushed to
backing storage as a part of shutdown handling, but not upon reset.

Suggested-by: Klaus Jensen 
Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme-ns.h |  2 +-
 hw/block/nvme-ns.c |  2 +-
 hw/block/nvme.c| 24 ++--
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 44bf6271b7..ed3d7e65d5 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -73,6 +73,6 @@ typedef struct NvmeCtrl NvmeCtrl;
 
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
-void nvme_ns_flush(NvmeNamespace *ns);
+void nvme_ns_shutdown(NvmeNamespace *ns);
 
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 847069a66e..9b95e2ed33 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -130,7 +130,7 @@ void nvme_ns_drain(NvmeNamespace *ns)
 blk_drain(ns->blkconf.blk);
 }
 
-void nvme_ns_flush(NvmeNamespace *ns)
+void nvme_ns_shutdown(NvmeNamespace *ns)
 {
 blk_flush(ns->blkconf.blk);
 }
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 59990e00bc..10acb7e7f0 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2197,6 +2197,20 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
 n->aer_queued = 0;
 n->outstanding_aers = 0;
 n->qs_created = false;
+}
+
+static void nvme_ctrl_reset(NvmeCtrl *n)
+{
+nvme_clear_ctrl(n);
+n->bar.cc = 0;
+}
+
+static void nvme_ctrl_shutdown(NvmeCtrl *n)
+{
+NvmeNamespace *ns;
+int i;
+
+nvme_clear_ctrl(n);
 
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
@@ -2204,10 +2218,8 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
 continue;
 }
 
-nvme_ns_flush(ns);
+nvme_ns_shutdown(ns);
 }
-
-n->bar.cc = 0;
 }
 
 static int nvme_start_ctrl(NvmeCtrl *n)
@@ -2374,12 +2386,12 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
 }
 } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
 trace_pci_nvme_mmio_stopped();
-nvme_clear_ctrl(n);
+nvme_ctrl_reset(n);
 n->bar.csts &= ~NVME_CSTS_READY;
 }
 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
 trace_pci_nvme_mmio_shutdown_set();
-nvme_clear_ctrl(n);
+nvme_ctrl_shutdown(n);
 n->bar.cc = data;
 n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
@@ -2990,7 +3002,7 @@ static void nvme_exit(PCIDevice *pci_dev)
 {
 NvmeCtrl *n = NVME(pci_dev);
 
-nvme_clear_ctrl(n);
+nvme_ctrl_shutdown(n);
 g_free(n->cq);
 g_free(n->sq);
 g_free(n->aer_reqs);
-- 
2.28.0




[PATCH v11 12/13] hw/block/nvme: Add injection of Offline/Read-Only zones

2020-12-08 Thread Dmitry Fomichev
ZNS specification defines two zone conditions for the zones that no
longer can function properly, possibly because of flash wear or other
internal fault. It is useful to be able to "inject" a small number of
such zones for testing purposes.

This commit defines two optional device properties, "offline_zones"
and "rdonly_zones". Users can assign non-zero values to these variables
to specify the number of zones to be initialized as Offline or
Read-Only. The actual number of injected zones may be smaller than the
requested amount - Read-Only and Offline counts are expected to be much
smaller than the total number of zones on a drive.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h |  2 ++
 hw/block/nvme-ns.c | 53 ++
 2 files changed, 55 insertions(+)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index f8f3c28c36..1196865b7a 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -36,6 +36,8 @@ typedef struct NvmeNamespaceParams {
 uint32_t max_active_zones;
 uint32_t max_open_zones;
 uint32_t zd_extension_size;
+uint32_t nr_offline_zones;
+uint32_t nr_rdonly_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index c5a7bafcf7..0a8b741bc9 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -21,6 +21,7 @@
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
+#include "crypto/random.h"
 
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
@@ -163,6 +164,21 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace 
*ns, Error **errp)
 }
 }
 
+if (ns->params.max_open_zones < ns->num_zones) {
+if (ns->params.nr_offline_zones >
+ns->num_zones - ns->params.max_open_zones) {
+error_setg(errp, "offline_zones value %u is too large",
+ns->params.nr_offline_zones);
+return -1;
+}
+if (ns->params.nr_rdonly_zones + ns->params.nr_offline_zones >
+ns->num_zones - ns->params.max_open_zones) {
+error_setg(errp, "rdonly_zones value %u is too large",
+ns->params.nr_rdonly_zones);
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -171,7 +187,9 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
 uint64_t start = 0, zone_size = ns->zone_size;
 uint64_t capacity = ns->num_zones * zone_size;
 NvmeZone *zone;
+uint32_t rnd;
 int i;
+uint16_t zs;
 
 ns->zone_array = g_new0(NvmeZone, ns->num_zones);
 if (ns->params.zd_extension_size) {
@@ -203,6 +221,37 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
 if (is_power_of_2(ns->zone_size)) {
 ns->zone_size_log2 = 63 - clz64(ns->zone_size);
 }
+
+/* If required, make some zones Offline or Read Only */
+
+for (i = 0; i < ns->params.nr_offline_zones; i++) {
+do {
+qcrypto_random_bytes(, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = >zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_OFFLINE);
+} else {
+i--;
+}
+}
+
+for (i = 0; i < ns->params.nr_rdonly_zones; i++) {
+do {
+qcrypto_random_bytes(, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = >zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE &&
+zs != NVME_ZONE_STATE_READ_ONLY) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_READ_ONLY);
+} else {
+i--;
+}
+}
 }
 
 static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index)
@@ -368,6 +417,10 @@ static Property nvme_ns_props[] = {
params.max_open_zones, 0),
 DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
params.zd_extension_size, 0),
+DEFINE_PROP_UINT32("zoned.offline_zones", NvmeNamespace,
+   params.nr_offline_zones, 0),
+DEFINE_PROP_UINT32("zoned.rdonly_zones", NvmeNamespace,
+   params.nr_rdonly_zones, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.28.0




[PATCH v11 00/13] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set

2020-12-08 Thread Dmitry Fomichev
it.

 - Use clz64/clz32 instead of defining nvme_ilog2() function.

 - Simplify rpt_empty_id_struct() code, move nvme_fill_data() back
   to ZNS patch.

 - Fix a power-on processing bug.

 - Rename NVME_CMD_ZONE_APND to NVME_CMD_ZONE_APPEND.

 - Make the list of review comments addressed in v2 of the series
   (see below).

v2 -> v3:

 - Moved nvme_fill_data() function to the NSTypes patch as it is
   now used there to output empty namespace identify structs.
 - Fixed typo in Maxim's email address.

v1 -> v2:

 - Rebased on top of qemu-nvme/next branch.
 - Incorporated feedback from Klaus and Alistair.
* Allow a subset of CSE log to be read, not the entire log
* Assign admin command entries in CSE log to ACS fields
* Set LPA bit 1 to indicate support of CSE log page
* Rename CC.CSS value CSS_ALL_NSTYPES (110b) to CSS_CSI
* Move the code to assign lbaf.ds to a separate patch
* Remove the change in firmware revision
* Change "driver" to "device" in comments and annotations
* Rename ZAMDS to ZASL
* Correct a few format expressions and some wording in
  trace event definitions
* Remove validation code to return NVME_CAP_EXCEEDED error
* Make ZASL to be equal to MDTS if "zone_append_size_limit"
  module parameter is not set
* Clean up nvme_zoned_init_ctrl() to make size calculations
  less confusing
* Avoid changing module parameters, use separate n/s variables
  if additional calculations are necessary to convert parameters
  to running values
* Use NVME_DEFAULT_ZONE_SIZE to assign the default zone size value
* Use default 0 for zone capacity meaning that zone capacity will
  be equal to zone size by default
* Issue warnings if user MAR/MOR values are too large and have
  to be adjusted
* Use unsigned values for MAR/MOR
 - Dropped "Simulate Zone Active excursions" patch.
   Excursion behavior may depend on the internal controller
   architecture and therefore be vendor-specific.
 - Dropped support for Zone Attributes and zoned AENs for now.
   These features can be added in a future series.
 - NS Types support is extended to handle active/inactive namespaces.
 - Update the write pointer after backing storage I/O completion, not
   before. This makes the emulation to run correctly in case of
   backing device failures.
 - Avoid division in the I/O path if the device zone size is
   a power of two (the most common case). Zone index then can be
   calculated by using bit shift.
 - A few reported bugs have been fixed.
 - Indentation in function definitions has been changed to make it
   the same as the rest of the code.


Zoned Namespace (ZNS) Command Set is a newly introduced command set
published by the NVM Express, Inc. organization as TP 4053. The main
design goals of ZNS are to provide hardware designers the means to
reduce NVMe controller complexity and to allow achieving a better I/O
latency and throughput. SSDs that implement this interface are
commonly known as ZNS SSDs.

This command set is implementing a zoned storage model, similarly to
ZAC/ZBC. As such, there is already support in Linux, allowing one to
perform the majority of tasks needed for managing ZNS SSDs.

The Zoned Namespace Command Set relies on another TP, known as
Namespace Types (NVMe TP 4056), which introduces support for having
multiple command sets per namespace.

Both ZNS and Namespace Types specifications can be downloaded by
visiting the following link -

https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs.zip

This patch series adds Namespace Types support and zoned namespace
emulation capability to the existing NVMe PCI device.


Based-on: <20201104102248.32168-1-...@irrelevant.dk>
Dmitry Fomichev (11):
  hw/block/nvme: Process controller reset and shutdown differently
  hw/block/nvme: Generate namespace UUIDs
  hw/block/nvme: Separate read and write handlers
  hw/block/nvme: Combine nvme_write_zeroes() and nvme_write()
  hw/block/nvme: Add Commands Supported and Effects log
  block/nvme: Make ZNS-related definitions
  hw/block/nvme: Support Zoned Namespace Command Set
  hw/block/nvme: Introduce max active and open zone limits
  hw/block/nvme: Support Zone Descriptor Extensions
  hw/block/nvme: Add injection of Offline/Read-Only zones
  hw/block/nvme: Document zoned parameters in usage text

Niklas Cassel (2):
  hw/block/nvme: Add support for Namespace Types
  hw/block/nvme: Support allocated CNS command variants

 hw/block/nvme-ns.h|  108 +++-
 hw/block/nvme.h   |6 +
 include/block/nvme.h  |  201 +-
 hw/block/nvme-ns.c|  271 +++-
 hw/block/nvme.c   | 1432 ++---
 hw/block/trace-events |   32 +-
 6 files changed, 1927 insertions(+), 123 deletions(-)

-- 
2.28.0




[PATCH v11 10/13] hw/block/nvme: Introduce max active and open zone limits

2020-12-08 Thread Dmitry Fomichev
Add two module properties, "zoned.max_active" and "zoned.max_open"
to control the maximum number of zones that can be active or open.
Once these variables are set to non-default values, these limits are
checked during I/O and Too Many Active or Too Many Open command status
is returned if they are exceeded.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h| 41 +++
 hw/block/nvme-ns.c| 31 ++-
 hw/block/nvme.c   | 92 +++
 hw/block/trace-events |  2 +
 4 files changed, 164 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 388381dda0..7e1fd26909 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -33,6 +33,8 @@ typedef struct NvmeNamespaceParams {
 bool cross_zone_read;
 uint64_t zone_size_bs;
 uint64_t zone_cap_bs;
+uint32_t max_active_zones;
+uint32_t max_open_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -54,6 +56,8 @@ typedef struct NvmeNamespace {
 uint64_tzone_size;
 uint64_tzone_capacity;
 uint32_tzone_size_log2;
+int32_t nr_open_zones;
+int32_t nr_active_zones;
 
 NvmeNamespaceParams params;
 
@@ -125,6 +129,43 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline void nvme_aor_inc_open(NvmeNamespace *ns)
+{
+assert(ns->nr_open_zones >= 0);
+if (ns->params.max_open_zones) {
+ns->nr_open_zones++;
+assert(ns->nr_open_zones <= ns->params.max_open_zones);
+}
+}
+
+static inline void nvme_aor_dec_open(NvmeNamespace *ns)
+{
+if (ns->params.max_open_zones) {
+assert(ns->nr_open_zones > 0);
+ns->nr_open_zones--;
+}
+assert(ns->nr_open_zones >= 0);
+}
+
+static inline void nvme_aor_inc_active(NvmeNamespace *ns)
+{
+assert(ns->nr_active_zones >= 0);
+if (ns->params.max_active_zones) {
+ns->nr_active_zones++;
+assert(ns->nr_active_zones <= ns->params.max_active_zones);
+}
+}
+
+static inline void nvme_aor_dec_active(NvmeNamespace *ns)
+{
+if (ns->params.max_active_zones) {
+assert(ns->nr_active_zones > 0);
+ns->nr_active_zones--;
+assert(ns->nr_active_zones >= ns->nr_open_zones);
+}
+assert(ns->nr_active_zones >= 0);
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 1df45bbe35..aaef69fb47 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -136,6 +136,21 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace 
*ns, Error **errp)
 ns->zone_size = zone_size / lbasz;
 ns->zone_capacity = zone_cap / lbasz;
 ns->num_zones = ns->size / lbasz / ns->zone_size;
+
+/* Do a few more sanity checks of ZNS properties */
+if (ns->params.max_open_zones > ns->num_zones) {
+error_setg(errp,
+   "max_open_zones value %u exceeds the number of zones %u",
+   ns->params.max_open_zones, ns->num_zones);
+return -1;
+}
+if (ns->params.max_active_zones > ns->num_zones) {
+error_setg(errp,
+   "max_active_zones value %u exceeds the number of zones %u",
+   ns->params.max_active_zones, ns->num_zones);
+return -1;
+}
+
 return 0;
 }
 
@@ -183,8 +198,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index)
 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
 
 /* MAR/MOR are zeroes-based, 0x means no limit */
-id_ns_z->mar = 0x;
-id_ns_z->mor = 0x;
+id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
+id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
 id_ns_z->zoc = 0;
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
@@ -210,6 +225,7 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
 }
+nvme_aor_inc_active(ns);
 QTAILQ_INSERT_HEAD(>closed_zones, zone, entry);
 } else {
 trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
@@ -226,16 +242,23 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
 
 QTAILQ_FOREACH_SAFE(zone, >closed_zones, entry, next) {
 QTAILQ_REMOVE(>closed_zones, zone, entry);
+nvme_aor_dec_active(ns);
 nvme_clear_zone(ns, zone);
 }
 QTAILQ_FOREACH_SAFE(zone, >imp_open_zones, entry, next) {
 

[PATCH v11 09/13] hw/block/nvme: Support Zoned Namespace Command Set

2020-12-08 Thread Dmitry Fomichev
The emulation code has been changed to advertise NVM Command Set when
"zoned" device property is not set (default) and Zoned Namespace
Command Set otherwise.

Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
Define trace events where needed in newly introduced code.

In order to improve scalability, all open, closed and full zones
are organized in separate linked lists. Consequently, almost all
zone operations don't require scanning of the entire zone array
(which potentially can be quite large) - it is only necessary to
enumerate one or more zone lists.

Handlers for three new NVMe commands introduced in Zoned Namespace
Command Set specification are added, namely for Zone Management
Receive, Zone Management Send and Zone Append.

Device initialization code has been extended to create a proper
configuration for zoned operation using device properties.

Read/Write command handler is modified to only allow writes at the
write pointer if the namespace is zoned. For Zone Append command,
writes implicitly happen at the write pointer and the starting write
pointer value is returned as the result of the command. Write Zeroes
handler is modified to add zoned checks that are identical to those
done as a part of Write flow.

Subsequent commits in this series add ZDE support and checks for
active and open zone limits.

Signed-off-by: Niklas Cassel 
Signed-off-by: Hans Holmberg 
Signed-off-by: Ajay Joshi 
Signed-off-by: Chaitanya Kulkarni 
Signed-off-by: Matias Bjorling 
Signed-off-by: Aravind Ramesh 
Signed-off-by: Shin'ichiro Kawasaki 
Signed-off-by: Adam Manzanares 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  52 +++
 hw/block/nvme.h   |   6 +
 hw/block/nvme-ns.c| 165 +
 hw/block/nvme.c   | 804 +-
 hw/block/trace-events |  17 +
 5 files changed, 1036 insertions(+), 8 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index bdbc98c2ec..388381dda0 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,9 +19,20 @@
 #define NVME_NS(obj) \
 OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
 
+typedef struct NvmeZone {
+NvmeZoneDescr   d;
+uint64_tw_ptr;
+QTAILQ_ENTRY(NvmeZone) entry;
+} NvmeZone;
+
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
 QemuUUID uuid;
+
+bool zoned;
+bool cross_zone_read;
+uint64_t zone_size_bs;
+uint64_t zone_cap_bs;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -33,6 +44,17 @@ typedef struct NvmeNamespace {
 const uint32_t *iocs;
 uint8_t  csi;
 
+NvmeIdNsZoned   *id_ns_zoned;
+NvmeZone*zone_array;
+QTAILQ_HEAD(, NvmeZone) exp_open_zones;
+QTAILQ_HEAD(, NvmeZone) imp_open_zones;
+QTAILQ_HEAD(, NvmeZone) closed_zones;
+QTAILQ_HEAD(, NvmeZone) full_zones;
+uint32_tnum_zones;
+uint64_tzone_size;
+uint64_tzone_capacity;
+uint32_tzone_size_log2;
+
 NvmeNamespaceParams params;
 
 struct {
@@ -74,8 +96,38 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t 
lba)
 
 typedef struct NvmeCtrl NvmeCtrl;
 
+static inline enum NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
+{
+return zone->d.zs >> 4;
+}
+
+static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState 
state)
+{
+zone->d.zs = state << 4;
+}
+
+static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
+{
+return zone->d.zslba + ns->zone_size;
+}
+
+static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
+{
+return zone->d.zslba + zone->d.zcap;
+}
+
+static inline bool nvme_wp_is_valid(NvmeZone *zone)
+{
+uint8_t st = nvme_get_zone_state(zone);
+
+return st != NVME_ZONE_STATE_FULL &&
+   st != NVME_ZONE_STATE_READ_ONLY &&
+   st != NVME_ZONE_STATE_OFFLINE;
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
+void nvme_ns_cleanup(NvmeNamespace *ns);
 
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 574333caa3..b7fbcca39d 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -6,6 +6,9 @@
 
 #define NVME_MAX_NAMESPACES 256
 
+#define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
+#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+
 typedef struct NvmeParams {
 char *serial;
 uint32_t num_queues; /* deprecated since 5.1 */
@@ -16,6 +19,7 @@ typedef struct NvmeParams {
 uint32_t aer_max_queued;
 uint8_t  mdts;
 bool use_intel_id;
+uint32_t zasl_bs;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -149,6 +153,8 @@ typedef struct NvmeCtrl {
 QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
 int aer_queued;
 
+uint8_t zasl;
+
 NvmeNamespace   namespa

[PATCH v11 13/13] hw/block/nvme: Document zoned parameters in usage text

2020-12-08 Thread Dmitry Fomichev
Added brief descriptions of the new device properties that are
now available to users to configure features of Zoned Namespace
Command Set in the emulator.

This patch is for documentation only, no functionality change.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme.c | 47 ++-
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c2336bfd67..fbb69c82c6 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -9,7 +9,7 @@
  */
 
 /**
- * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
+ * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
  *
  *  https://nvmexpress.org/developers/nvme-specification/
  */
@@ -22,8 +22,9 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
- *  mdts=<N[optional]>
- *  -device nvme-ns,drive=<drive_id>,bus=bus_name,nsid=<nsid>
+ *  mdts=<N[optional]>,zoned.append_size_limit=<N[optional]> \
+ *  -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
+ *  zoned=<true|false[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -41,14 +42,50 @@
  * ~~
  * - `aerl`
  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
- *   of concurrently outstanding Asynchronous Event Request commands suppoert
+ *   of concurrently outstanding Asynchronous Event Request commands supported
  *   by the controller. This is a 0's based value.
  *
  * - `aer_max_queued`
  *   This is the maximum number of events that the device will enqueue for
- *   completion when there are no oustanding AERs. When the maximum number of
+ *   completion when there are no outstanding AERs. When the maximum number of
  *   enqueued events are reached, subsequent events will be dropped.
  *
+ * - `zoned.append_size_limit`
+ *   The maximum I/O size in bytes that is allowed in Zone Append command.
+ *   The default is 128KiB. Since internally this value is maintained as
+ *   ZASL = log2(<maximum zone append size> / <minimum memory page size>),
+ *   some values assigned
+ *   to this property may be rounded down and result in a lower maximum ZA
+ *   data size being in effect. By setting this property to 0, users can make
+ *   ZASL to be equal to MDTS. This property only affects zoned namespaces.
+ *
+ * Setting `zoned` to true selects Zoned Command Set at the namespace.
+ * In this case, the following namespace properties are available to configure
+ * zoned operation:
 * zoned.zsze=<zone size in bytes, default: 128MiB>
+ * The number may be followed by K, M, G as in kilo-, mega- or giga-.
+ *
 * zoned.zcap=<zone capacity in bytes, default: zone size>
+ * The value 0 (default) forces zone capacity to be the same as zone
+ * size. The value of this property may not exceed zone size.
+ *
 * zoned.descr_ext_size=<zone descriptor extension size, default: 0>
+ * This value needs to be specified in 64B units. If it is zero,
+ * namespace(s) will not support zone descriptor extensions.
+ *
 * zoned.max_active=<Maximum Active Resources (zones), default: 0 - no limit>
+ * The default value means there is no limit to the number of
+ * concurrently active zones.
+ *
 * zoned.max_open=<Maximum Open Resources (zones), default: 0 - no limit>
+ * The default value means there is no limit to the number of
+ * concurrently open zones.
+ *
 * zoned.offline_zones=<the number of offline zones to inject, default: 0>
+ *
 * zoned.rdonly_zones=<the number of read-only zones to inject, default: 0>
+ *
 * zoned.cross_zone_read=<true|false, default: false>
+ * Setting this property to true enables Read Across Zone Boundaries.
  */
 
 #include "qemu/osdep.h"
-- 
2.28.0




[PATCH v11 04/13] hw/block/nvme: Combine nvme_write_zeroes() and nvme_write()

2020-12-08 Thread Dmitry Fomichev
Move write processing to nvme_do_write() that now handles both WRITE
and WRITE ZEROES. Both nvme_write() and nvme_write_zeroes() become
inline helper functions.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Acked-by: Klaus Jensen 
---
 hw/block/nvme.c   | 78 ---
 hw/block/trace-events |  1 -
 2 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 897c2d04e5..986917dabf 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1128,32 +1128,7 @@ invalid:
 return status | NVME_DNR;
 }
 
-static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
-{
-NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
-NvmeNamespace *ns = req->ns;
-uint64_t slba = le64_to_cpu(rw->slba);
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
-uint64_t offset = nvme_l2b(ns, slba);
-uint32_t count = nvme_l2b(ns, nlb);
-uint16_t status;
-
-trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
-
-status = nvme_check_bounds(n, ns, slba, nlb);
-if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
-return status;
-}
-
-block_acct_start(blk_get_stats(req->ns->blkconf.blk), >acct, 0,
- BLOCK_ACCT_WRITE);
-req->aiocb = blk_aio_pwrite_zeroes(req->ns->blkconf.blk, offset, count,
-   BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
-return NVME_NO_COMPLETE;
-}
-
-static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool wrz)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
 NvmeNamespace *ns = req->ns;
@@ -1167,10 +1142,12 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
  nvme_nsid(ns), nlb, data_size, slba);
 
-status = nvme_check_mdts(n, data_size);
-if (status) {
-trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
-goto invalid;
+if (!wrz) {
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
 }
 
 status = nvme_check_bounds(n, ns, slba, nlb);
@@ -1179,21 +1156,28 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req)
 goto invalid;
 }
 
-status = nvme_map_dptr(n, data_size, req);
-if (status) {
-goto invalid;
-}
-
 data_offset = nvme_l2b(ns, slba);
 
-block_acct_start(blk_get_stats(blk), >acct, data_size,
- BLOCK_ACCT_WRITE);
-if (req->qsg.sg) {
-req->aiocb = dma_blk_write(blk, >qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+if (!wrz) {
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_WRITE);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_write(blk, >qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
+ nvme_rw_cb, req);
+}
 } else {
-req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
- nvme_rw_cb, req);
+block_acct_start(blk_get_stats(blk), >acct, 0, BLOCK_ACCT_WRITE);
+req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
+   BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
+   req);
 }
 return NVME_NO_COMPLETE;
 
@@ -1202,6 +1186,16 @@ invalid:
 return status | NVME_DNR;
 }
 
+static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
+{
+return nvme_do_write(n, req, false);
+}
+
+static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
+{
+return nvme_do_write(n, req, true);
+}
+
 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 6233f801e1..02a7c3044c 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -43,7 +43,6 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t 
opcode, const char *opna
 pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, 
uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 
0x%"PRIx64""
 pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, 
uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %&q

[PATCH v11 06/13] hw/block/nvme: Add support for Namespace Types

2020-12-08 Thread Dmitry Fomichev
From: Niklas Cassel 

Define the structures and constants required to implement
Namespace Types support.

Namespace Types introduce a new command set, "I/O Command Sets",
that allows the host to retrieve the command sets associated with
a namespace. Introduce support for the command set and enable
detection for the NVM Command Set.

The new workflows for identify commands rely heavily on zero-filled
identify structs. E.g., certain CNS commands are defined to return
a zero-filled identify struct when an inactive namespace NSID
is supplied.

Add a helper function in order to avoid code duplication when
reporting zero-filled identify structures.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
---
 hw/block/nvme-ns.h|   1 +
 include/block/nvme.h  |  64 ++
 hw/block/nvme-ns.c|   2 +
 hw/block/nvme.c   | 188 +++---
 hw/block/trace-events |   6 ++
 5 files changed, 217 insertions(+), 44 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index bdeaf1c0de..bdbc98c2ec 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
 int64_t  size;
 NvmeIdNs id_ns;
 const uint32_t *iocs;
+uint8_t  csi;
 
 NvmeNamespaceParams params;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 422c98a297..890977db4b 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -84,6 +84,7 @@ enum NvmeCapMask {
 
 enum NvmeCapCss {
 NVME_CAP_CSS_NVM= 1 << 0,
+NVME_CAP_CSS_CSI_SUPP   = 1 << 6,
 NVME_CAP_CSS_ADMIN_ONLY = 1 << 7,
 };
 
@@ -117,9 +118,25 @@ enum NvmeCcMask {
 
 enum NvmeCcCss {
 NVME_CC_CSS_NVM= 0x0,
+NVME_CC_CSS_CSI= 0x6,
 NVME_CC_CSS_ADMIN_ONLY = 0x7,
 };
 
+#define NVME_SET_CC_EN(cc, val) \
+(cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT)
+#define NVME_SET_CC_CSS(cc, val)\
+(cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT)
+#define NVME_SET_CC_MPS(cc, val)\
+(cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT)
+#define NVME_SET_CC_AMS(cc, val)\
+(cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT)
+#define NVME_SET_CC_SHN(cc, val)\
+(cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT)
+#define NVME_SET_CC_IOSQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
 enum NvmeCstsShift {
 CSTS_RDY_SHIFT  = 0,
 CSTS_CFS_SHIFT  = 1,
@@ -534,8 +551,13 @@ typedef struct QEMU_PACKED NvmeIdentify {
 uint64_trsvd2[2];
 uint64_tprp1;
 uint64_tprp2;
-uint32_tcns;
-uint32_trsvd11[5];
+uint8_t cns;
+uint8_t rsvd10;
+uint16_tctrlid;
+uint16_tnvmsetid;
+uint8_t rsvd11;
+uint8_t csi;
+uint32_trsvd12[4];
 } NvmeIdentify;
 
 typedef struct QEMU_PACKED NvmeRwCmd {
@@ -656,6 +678,7 @@ enum NvmeStatusCodes {
 NVME_SGL_DESCR_TYPE_INVALID = 0x0011,
 NVME_INVALID_USE_OF_CMB = 0x0012,
 NVME_INVALID_PRP_OFFSET = 0x0013,
+NVME_CMD_SET_CMB_REJECTED   = 0x002b,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -783,11 +806,15 @@ typedef struct QEMU_PACKED NvmePSD {
 
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
-enum {
-NVME_ID_CNS_NS = 0x0,
-NVME_ID_CNS_CTRL   = 0x1,
-NVME_ID_CNS_NS_ACTIVE_LIST = 0x2,
-NVME_ID_CNS_NS_DESCR_LIST  = 0x3,
+enum NvmeIdCns {
+NVME_ID_CNS_NS= 0x00,
+NVME_ID_CNS_CTRL  = 0x01,
+NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
+NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+NVME_ID_CNS_CS_NS = 0x05,
+NVME_ID_CNS_CS_CTRL   = 0x06,
+NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
@@ -938,6 +965,7 @@ enum NvmeFeatureIds {
 NVME_WRITE_ATOMICITY= 0xa,
 NVME_ASYNCHRONOUS_EVENT_CONF= 0xb,
 NVME_TIMESTAMP  = 0xe,
+NVME_COMMAND_SET_PROFILE= 0x19,
 NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
 NVME_FID_MAX= 0x100,
 };
@@ -1027,18 +1055,26 @@ typedef struct QEMU_PACKED NvmeIdNsDescr {
 uint8_t rsvd2[2];
 } NvmeIdNsDescr;
 
-enum {
-NVME_NIDT_EUI64_LEN =  8,
-NVME_NIDT_NGUID_LEN = 16,
-NVME_NIDT_UUID_LEN  = 16,
+enum NvmeNsIdentifierLength {
+NVME_NIDL_EUI64 = 8,
+NVME_NIDL_NGUID = 16,
+NVME_NIDL_UUID  = 16,
+NVME_NIDL_CSI   = 1,
 };
 
 enum NvmeNsIdentifierType {
-NVME_NIDT_EUI64 = 0x1,
-NVME_NIDT_NGUID = 0x2,
-NVME_NIDT_UUID  = 0x3,
+  

[PATCH v11 02/13] hw/block/nvme: Generate namespace UUIDs

2020-12-08 Thread Dmitry Fomichev
In NVMe 1.4, a namespace must report an ID descriptor of UUID type
if it doesn't support EUI64 or NGUID. Add a new namespace property,
"uuid", that provides the user the option to either specify the UUID
explicitly or have a UUID generated automatically every time a
namespace is initialized.

Suggested-by: Klaus Jensen 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h | 1 +
 hw/block/nvme-ns.c | 1 +
 hw/block/nvme.c| 9 +
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index ed3d7e65d5..aeca810fc7 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -21,6 +21,7 @@
 
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
+QemuUUID uuid;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 9b95e2ed33..6349aa30be 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -152,6 +152,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 static Property nvme_ns_props[] = {
 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
+DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 10acb7e7f0..a30fe75620 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1662,6 +1662,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
+NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
 uint32_t nsid = le32_to_cpu(c->nsid);
 uint8_t list[NVME_IDENTIFY_DATA_SIZE];
@@ -1681,7 +1682,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_NSID | NVME_DNR;
 }
 
-if (unlikely(!nvme_ns(n, nsid))) {
+ns = nvme_ns(n, nsid);
+if (unlikely(!ns)) {
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
@@ -1690,12 +1692,11 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl 
*n, NvmeRequest *req)
 /*
  * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
  * structure, a Namespace UUID (nidt = 0x3) must be reported in the
- * Namespace Identification Descriptor. Add a very basic Namespace UUID
- * here.
+ * Namespace Identification Descriptor. Add the namespace UUID here.
  */
 ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
 ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
-stl_be_p(_descrs->uuid.v, nsid);
+memcpy(_descrs->uuid.v, ns->params.uuid.data, NVME_NIDT_UUID_LEN);
 
 return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
 DMA_DIRECTION_FROM_DEVICE, req);
-- 
2.28.0




[PATCH v11 05/13] hw/block/nvme: Add Commands Supported and Effects log

2020-12-08 Thread Dmitry Fomichev
This log page becomes necessary to implement to allow checking for
Zone Append command support in Zoned Namespace Command Set.

This commit adds the code to report this log page for NVM Command
Set only. The parts that are specific to zoned operation will be
added later in the series.

All incoming admin and i/o commands are now only processed if their
corresponding support bits are set in this log. This provides an
easy way to control what commands to support and what not to
depending on set CC.CSS.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  1 +
 include/block/nvme.h  | 19 +
 hw/block/nvme.c   | 96 +++
 hw/block/trace-events |  1 +
 4 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index aeca810fc7..bdeaf1c0de 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -30,6 +30,7 @@ typedef struct NvmeNamespace {
 int32_t  bootindex;
 int64_t  size;
 NvmeIdNs id_ns;
+const uint32_t *iocs;
 
 NvmeNamespaceParams params;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index e95ff6ca9b..422c98a297 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -746,10 +746,27 @@ enum NvmeSmartWarn {
 NVME_SMART_FAILED_VOLATILE_MEDIA  = 1 << 4,
 };
 
+typedef struct NvmeEffectsLog {
+uint32_tacs[256];
+uint32_tiocs[256];
+uint8_t resv[2048];
+} NvmeEffectsLog;
+
+enum {
+NVME_CMD_EFF_CSUPP  = 1 << 0,
+NVME_CMD_EFF_LBCC   = 1 << 1,
+NVME_CMD_EFF_NCC= 1 << 2,
+NVME_CMD_EFF_NIC= 1 << 3,
+NVME_CMD_EFF_CCC= 1 << 4,
+NVME_CMD_EFF_CSE_MASK   = 3 << 16,
+NVME_CMD_EFF_UUID_SEL   = 1 << 19,
+};
+
 enum NvmeLogIdentifier {
 NVME_LOG_ERROR_INFO = 0x01,
 NVME_LOG_SMART_INFO = 0x02,
 NVME_LOG_FW_SLOT_INFO   = 0x03,
+NVME_LOG_CMD_EFFECTS= 0x05,
 };
 
 typedef struct QEMU_PACKED NvmePSD {
@@ -862,6 +879,7 @@ enum NvmeIdCtrlFrmw {
 
 enum NvmeIdCtrlLpa {
 NVME_LPA_NS_SMART = 1 << 0,
+NVME_LPA_CSE  = 1 << 1,
 NVME_LPA_EXTENDED = 1 << 2,
 };
 
@@ -1070,6 +1088,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
+QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 986917dabf..0b047f2069 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -112,6 +112,28 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
 };
 
+static const uint32_t nvme_cse_acs[256] = {
+[NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_DELETE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ABORT]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
+};
+
+static const uint32_t nvme_cse_iocs_none[256];
+
+static const uint32_t nvme_cse_iocs_nvm[256] = {
+[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
+};
+
 static void nvme_process_sq(void *opaque);
 
 static uint16_t nvme_cid(NvmeRequest *req)
@@ -1203,10 +1225,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
   req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
 
-if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) {
-return NVME_INVALID_OPCODE | NVME_DNR;
-}
-
 if (!nvme_nsid_valid(n, nsid)) {
 return NVME_INVALID_NSID | NVME_DNR;
 }
@@ -1216,6 +1234,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
+if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
+trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
+return NVME_INVALID_OPCODE | NVME_DNR;
+}
+
 switch (req->cmd.opcode) {
 case NVME_CMD_FLUSH:
 return nvme_flush(n, req);
@@ -1228,

[PATCH v11 11/13] hw/block/nvme: Support Zone Descriptor Extensions

2020-12-08 Thread Dmitry Fomichev
Zone Descriptor Extension is a label that can be assigned to a zone.
It can be set to an Empty zone and it stays assigned until the zone
is reset.

This commit adds a new optional module property,
"zoned.descr_ext_size". Its value must be a multiple of 64 bytes.
If this value is non-zero, it becomes possible to assign extensions
of that size to any Empty zones. The default value for this property
is 0, therefore setting extensions is disabled by default.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  8 +++
 hw/block/nvme-ns.c| 25 ++--
 hw/block/nvme.c   | 53 +--
 hw/block/trace-events |  2 ++
 4 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7e1fd26909..f8f3c28c36 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -35,6 +35,7 @@ typedef struct NvmeNamespaceParams {
 uint64_t zone_cap_bs;
 uint32_t max_active_zones;
 uint32_t max_open_zones;
+uint32_t zd_extension_size;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -56,6 +57,7 @@ typedef struct NvmeNamespace {
 uint64_tzone_size;
 uint64_tzone_capacity;
 uint32_tzone_size_log2;
+uint8_t *zd_extensions;
 int32_t nr_open_zones;
 int32_t nr_active_zones;
 
@@ -129,6 +131,12 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
+ uint32_t zone_idx)
+{
+return >zd_extensions[zone_idx * ns->params.zd_extension_size];
+}
+
 static inline void nvme_aor_inc_open(NvmeNamespace *ns)
 {
 assert(ns->nr_open_zones >= 0);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index aaef69fb47..c5a7bafcf7 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -151,6 +151,18 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace 
*ns, Error **errp)
 return -1;
 }
 
+if (ns->params.zd_extension_size) {
+if (ns->params.zd_extension_size & 0x3f) {
+error_setg(errp,
+"zone descriptor extension size must be a multiple of 64B");
+return -1;
+}
+if ((ns->params.zd_extension_size >> 6) > 0xff) {
+error_setg(errp, "zone descriptor extension size is too large");
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -162,6 +174,10 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
 int i;
 
 ns->zone_array = g_new0(NvmeZone, ns->num_zones);
+if (ns->params.zd_extension_size) {
+ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
+  ns->num_zones);
+}
 
 QTAILQ_INIT(>exp_open_zones);
 QTAILQ_INIT(>imp_open_zones);
@@ -204,7 +220,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index)
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
 id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
-id_ns_z->lbafe[lba_index].zdes = 0;
+id_ns_z->lbafe[lba_index].zdes =
+ns->params.zd_extension_size >> 6; /* Units of 64B */
 
 ns->csi = NVME_CSI_ZONED;
 ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
@@ -220,7 +237,8 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 
 zone->w_ptr = zone->d.wp;
 state = nvme_get_zone_state(zone);
-if (zone->d.wp != zone->d.zslba) {
+if (zone->d.wp != zone->d.zslba ||
+(zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
 if (state != NVME_ZONE_STATE_CLOSED) {
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
@@ -316,6 +334,7 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
 if (ns->params.zoned) {
 g_free(ns->id_ns_zoned);
 g_free(ns->zone_array);
+g_free(ns->zd_extensions);
 }
 }
 
@@ -347,6 +366,8 @@ static Property nvme_ns_props[] = {
params.max_active_zones, 0),
 DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
params.max_open_zones, 0),
+DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
+   params.zd_extension_size, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 8b97b713a3..c2336bfd67 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1715,6 +1715,25 @@ static uint16_t nvme_offline_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 }
 }
 
+static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
+{
+ui

[PATCH v11 08/13] block/nvme: Make ZNS-related definitions

2020-12-08 Thread Dmitry Fomichev
Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053).

Signed-off-by: Dmitry Fomichev 
---
 include/block/nvme.h | 114 ++-
 1 file changed, 113 insertions(+), 1 deletion(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 29d826ab19..a9165402d6 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -489,6 +489,9 @@ enum NvmeIoCommands {
 NVME_CMD_COMPARE= 0x05,
 NVME_CMD_WRITE_ZEROES   = 0x08,
 NVME_CMD_DSM= 0x09,
+NVME_CMD_ZONE_MGMT_SEND = 0x79,
+NVME_CMD_ZONE_MGMT_RECV = 0x7a,
+NVME_CMD_ZONE_APPEND= 0x7d,
 };
 
 typedef struct QEMU_PACKED NvmeDeleteQ {
@@ -648,9 +651,13 @@ typedef struct QEMU_PACKED NvmeAerResult {
 uint8_t resv;
 } NvmeAerResult;
 
+typedef struct QEMU_PACKED NvmeZonedResult {
+uint64_t slba;
+} NvmeZonedResult;
+
 typedef struct QEMU_PACKED NvmeCqe {
 uint32_tresult;
-uint32_trsvd;
+uint32_tdw1;
 uint16_tsq_head;
 uint16_tsq_id;
 uint16_tcid;
@@ -679,6 +686,7 @@ enum NvmeStatusCodes {
 NVME_INVALID_USE_OF_CMB = 0x0012,
 NVME_INVALID_PRP_OFFSET = 0x0013,
 NVME_CMD_SET_CMB_REJECTED   = 0x002b,
+NVME_INVALID_CMD_SET= 0x002c,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -703,6 +711,14 @@ enum NvmeStatusCodes {
 NVME_CONFLICTING_ATTRS  = 0x0180,
 NVME_INVALID_PROT_INFO  = 0x0181,
 NVME_WRITE_TO_RO= 0x0182,
+NVME_ZONE_BOUNDARY_ERROR= 0x01b8,
+NVME_ZONE_FULL  = 0x01b9,
+NVME_ZONE_READ_ONLY = 0x01ba,
+NVME_ZONE_OFFLINE   = 0x01bb,
+NVME_ZONE_INVALID_WRITE = 0x01bc,
+NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
+NVME_ZONE_TOO_MANY_OPEN = 0x01be,
+NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
 NVME_WRITE_FAULT= 0x0280,
 NVME_UNRECOVERED_READ   = 0x0281,
 NVME_E2E_GUARD_ERROR= 0x0282,
@@ -888,6 +904,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
 uint8_t vs[1024];
 } NvmeIdCtrl;
 
+typedef struct NvmeIdCtrlZoned {
+uint8_t zasl;
+uint8_t rsvd1[4095];
+} NvmeIdCtrlZoned;
+
 enum NvmeIdCtrlOacs {
 NVME_OACS_SECURITY  = 1 << 0,
 NVME_OACS_FORMAT= 1 << 1,
@@ -1016,6 +1037,12 @@ typedef struct QEMU_PACKED NvmeLBAF {
 uint8_t rp;
 } NvmeLBAF;
 
+typedef struct QEMU_PACKED NvmeLBAFE {
+uint64_tzsze;
+uint8_t zdes;
+uint8_t rsvd9[7];
+} NvmeLBAFE;
+
#define NVME_NSID_BROADCAST 0xffffffff
 
 typedef struct QEMU_PACKED NvmeIdNs {
@@ -1075,10 +1102,24 @@ enum NvmeNsIdentifierType {
 
 enum NvmeCsi {
 NVME_CSI_NVM= 0x00,
+NVME_CSI_ZONED  = 0x02,
 };
 
 #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
 
+typedef struct QEMU_PACKED NvmeIdNsZoned {
+uint16_tzoc;
+uint16_tozcs;
+uint32_tmar;
+uint32_tmor;
+uint32_trrl;
+uint32_tfrl;
+uint8_t rsvd20[2796];
+NvmeLBAFE   lbafe[16];
+uint8_t rsvd3072[768];
+uint8_t vs[256];
+} NvmeIdNsZoned;
+
 /*Deallocate Logical Block Features*/
 #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)   ((dlfeat) & 0x10)
 #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)((dlfeat) & 0x08)
@@ -1111,10 +1152,76 @@ enum NvmeIdNsDps {
 DPS_FIRST_EIGHT = 8,
 };
 
+enum NvmeZoneAttr {
+NVME_ZA_FINISHED_BY_CTLR = 1 << 0,
+NVME_ZA_FINISH_RECOMMENDED   = 1 << 1,
+NVME_ZA_RESET_RECOMMENDED= 1 << 2,
+NVME_ZA_ZD_EXT_VALID = 1 << 7,
+};
+
+typedef struct QEMU_PACKED NvmeZoneReportHeader {
+uint64_tnr_zones;
+uint8_t rsvd[56];
+} NvmeZoneReportHeader;
+
+enum NvmeZoneReceiveAction {
+NVME_ZONE_REPORT = 0,
+NVME_ZONE_REPORT_EXTENDED= 1,
+};
+
+enum NvmeZoneReportType {
+NVME_ZONE_REPORT_ALL = 0,
+NVME_ZONE_REPORT_EMPTY   = 1,
+NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
+NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
+NVME_ZONE_REPORT_CLOSED  = 4,
+NVME_ZONE_REPORT_FULL= 5,
+NVME_ZONE_REPORT_READ_ONLY   = 6,
+NVME_ZONE_REPORT_OFFLINE = 7,
+};
+
+enum NvmeZoneType {
+NVME_ZONE_TYPE_RESERVED  = 0x00,
+NVME_ZONE_TYPE_SEQ_WRITE = 0x02,
+};
+
+enum NvmeZoneSendAction {
+NVME_ZONE_ACTION_RSD = 0x00,
+NVME_ZONE_ACTION_CLOSE   = 0x01,
+NVME_ZONE_ACTION_FINISH  = 0x02,
+NVME_ZONE_ACTION_OPEN= 0x03,
+NVME_ZONE_ACTION_RESET   = 0x04,
+NVME_ZONE_ACTION_OFFLINE = 0x05,
+NVME_ZONE_ACTION_SET_ZD_EXT  = 0x10,
+};
+
+typedef struct QEMU_PACKED NvmeZoneDescr {
+uint8_t zt;
+uint8_t zs;
+uint8_t za

[PATCH v11 07/13] hw/block/nvme: Support allocated CNS command variants

2020-12-08 Thread Dmitry Fomichev
From: Niklas Cassel 

Many CNS commands have "allocated" command variants. These include
a namespace as long as it is allocated, that is a namespace is
included regardless if it is active (attached) or not.

While these commands are optional (they are mandatory for controllers
supporting the namespace attachment command), our QEMU implementation
is more complete by actually providing support for these CNS values.

However, since our QEMU model currently does not support the namespace
attachment command, these new allocated CNS commands will return the
same result as the active CNS command variants.

The reason for not hooking up this command completely is because the
NVMe specification requires the namespace management command to be
supported if the namespace attachment command is supported.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
---
 include/block/nvme.h | 20 
 hw/block/nvme.c  |  8 
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 890977db4b..29d826ab19 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -807,14 +807,18 @@ typedef struct QEMU_PACKED NvmePSD {
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
 enum NvmeIdCns {
-NVME_ID_CNS_NS= 0x00,
-NVME_ID_CNS_CTRL  = 0x01,
-NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
-NVME_ID_CNS_NS_DESCR_LIST = 0x03,
-NVME_ID_CNS_CS_NS = 0x05,
-NVME_ID_CNS_CS_CTRL   = 0x06,
-NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
-NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
+NVME_ID_CNS_NS= 0x00,
+NVME_ID_CNS_CTRL  = 0x01,
+NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
+NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+NVME_ID_CNS_CS_NS = 0x05,
+NVME_ID_CNS_CS_CTRL   = 0x06,
+NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+NVME_ID_CNS_NS_PRESENT_LIST   = 0x10,
+NVME_ID_CNS_NS_PRESENT= 0x11,
+NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
+NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
+NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 16eed37533..7035896649 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1901,16 +1901,24 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 
 switch (le32_to_cpu(c->cns)) {
 case NVME_ID_CNS_NS:
+ /* fall through */
+case NVME_ID_CNS_NS_PRESENT:
 return nvme_identify_ns(n, req);
 case NVME_ID_CNS_CS_NS:
+ /* fall through */
+case NVME_ID_CNS_CS_NS_PRESENT:
 return nvme_identify_ns_csi(n, req);
 case NVME_ID_CNS_CTRL:
 return nvme_identify_ctrl(n, req);
 case NVME_ID_CNS_CS_CTRL:
 return nvme_identify_ctrl_csi(n, req);
 case NVME_ID_CNS_NS_ACTIVE_LIST:
+ /* fall through */
+case NVME_ID_CNS_NS_PRESENT_LIST:
 return nvme_identify_nslist(n, req);
 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
+ /* fall through */
+case NVME_ID_CNS_CS_NS_PRESENT_LIST:
 return nvme_identify_nslist_csi(n, req);
 case NVME_ID_CNS_NS_DESCR_LIST:
 return nvme_identify_ns_descr_list(n, req);
-- 
2.28.0




[PATCH v11 03/13] hw/block/nvme: Separate read and write handlers

2020-12-08 Thread Dmitry Fomichev
The majority of code in nvme_rw() is becoming read- or write-specific.
Move these parts to two separate handlers, nvme_read() and nvme_write()
to make the code more readable and to remove multiple is_write checks
that has been present in the i/o path.

This is a refactoring patch, no change in functionality.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Acked-by: Klaus Jensen 
---
 hw/block/nvme.c   | 107 --
 hw/block/trace-events |   3 +-
 2 files changed, 74 insertions(+), 36 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a30fe75620..897c2d04e5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1073,6 +1073,61 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
+static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+NvmeNamespace *ns = req->ns;
+uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+uint64_t data_size = nvme_l2b(ns, nlb);
+uint64_t data_offset;
+BlockBackend *blk = ns->blkconf.blk;
+uint16_t status;
+
+trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba);
+
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
+
+status = nvme_check_bounds(n, ns, slba, nlb);
+if (status) {
+trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+goto invalid;
+}
+
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+status = nvme_check_dulbe(ns, slba, nlb);
+if (status) {
+goto invalid;
+}
+}
+
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_READ);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_read(blk, >qsg, data_offset,
+  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_preadv(blk, data_offset, >iov, 0,
+nvme_rw_cb, req);
+}
+return NVME_NO_COMPLETE;
+
+invalid:
+block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
+return status | NVME_DNR;
+}
+
 static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
@@ -1098,22 +1153,19 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
 NvmeNamespace *ns = req->ns;
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
-
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t data_size = nvme_l2b(ns, nlb);
-uint64_t data_offset = nvme_l2b(ns, slba);
-enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ?
-BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+uint64_t data_offset;
 BlockBackend *blk = ns->blkconf.blk;
 uint16_t status;
 
-trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode),
-  nvme_nsid(ns), nlb, data_size, slba);
+trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
+ nvme_nsid(ns), nlb, data_size, slba);
 
 status = nvme_check_mdts(n, data_size);
 if (status) {
@@ -1127,43 +1179,27 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 goto invalid;
 }
 
-if (acct == BLOCK_ACCT_READ) {
-if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
-status = nvme_check_dulbe(ns, slba, nlb);
-if (status) {
-goto invalid;
-}
-}
-}
-
 status = nvme_map_dptr(n, data_size, req);
 if (status) {
 goto invalid;
 }
 
-block_acct_start(blk_get_stats(blk), >acct, data_size, acct);
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_WRITE);
 if (req->qsg.sg) {
-if (acct == BLOCK_ACCT_WRITE) {
-req->aiocb = dma_blk_write(blk, >qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-} else {
-req->aiocb = dma_blk_read(blk, >qsg, data_offset,
-  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-}
+req->aiocb = dma_blk_write(blk, >qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
 } else {
-if (acct == BLOCK_ACCT_WRITE) {
-req->aiocb 

Re: [PATCH v9 08/12] hw/block/nvme: Support Zoned Namespace Command Set

2020-12-08 Thread Dmitry Fomichev
Hi Klaus,

Thank you for your review! Please see replies below...


On Thu, 2020-11-12 at 20:36 +0100, Klaus Jensen wrote:
> Hi Dmitry,
> 
> I know you posted v10, but my comments should be relevant to that as
> well.
> 
> On Nov  5 11:53, Dmitry Fomichev wrote:
> > The emulation code has been changed to advertise NVM Command Set when
> > "zoned" device property is not set (default) and Zoned Namespace
> > Command Set otherwise.
> > 
> > Define values and structures that are needed to support Zoned
> > Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
> > Define trace events where needed in newly introduced code.
> > 
> > In order to improve scalability, all open, closed and full zones
> > are organized in separate linked lists. Consequently, almost all
> > zone operations don't require scanning of the entire zone array
> > (which potentially can be quite large) - it is only necessary to
> > enumerate one or more zone lists.
> > 
> > Handlers for three new NVMe commands introduced in Zoned Namespace
> > Command Set specification are added, namely for Zone Management
> > Receive, Zone Management Send and Zone Append.
> > 
> > Device initialization code has been extended to create a proper
> > configuration for zoned operation using device properties.
> > 
> > Read/Write command handler is modified to only allow writes at the
> > write pointer if the namespace is zoned. For Zone Append command,
> > writes implicitly happen at the write pointer and the starting write
> > pointer value is returned as the result of the command. Write Zeroes
> > handler is modified to add zoned checks that are identical to those
> > done as a part of Write flow.
> > 
> > Subsequent commits in this series add ZDE support and checks for
> > active and open zone limits.
> > 
> > Signed-off-by: Niklas Cassel 
> > Signed-off-by: Hans Holmberg 
> > Signed-off-by: Ajay Joshi 
> > Signed-off-by: Chaitanya Kulkarni 
> > Signed-off-by: Matias Bjorling 
> > Signed-off-by: Aravind Ramesh 
> > Signed-off-by: Shin'ichiro Kawasaki 
> > Signed-off-by: Adam Manzanares 
> > Signed-off-by: Dmitry Fomichev 
> > Reviewed-by: Niklas Cassel 
> > ---
> >  hw/block/nvme-ns.h|  54 +++
> >  hw/block/nvme.h   |   8 +
> >  hw/block/nvme-ns.c| 173 
> >  hw/block/nvme.c   | 971 +-
> >  hw/block/trace-events |  18 +-
> >  5 files changed, 1209 insertions(+), 15 deletions(-)
> > 
> > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > index 2d9cd29d07..d2631ff5a3 100644
> > --- a/hw/block/nvme-ns.h
> > +++ b/hw/block/nvme-ns.h
> > @@ -19,9 +19,20 @@
> >  #define NVME_NS(obj) \
> >  OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
> >  
> > +typedef struct NvmeZone {
> > +NvmeZoneDescr   d;
> > +uint64_tw_ptr;
> > +QTAILQ_ENTRY(NvmeZone) entry;
> > +} NvmeZone;
> > +
> >  typedef struct NvmeNamespaceParams {
> >  uint32_t nsid;
> >  QemuUUID uuid;
> > +
> > +bool zoned;
> > +bool cross_zone_read;
> > +uint64_t zone_size_bs;
> > +uint64_t zone_cap_bs;
> >  } NvmeNamespaceParams;
> >  
> >  typedef struct NvmeNamespace {
> > @@ -34,6 +45,18 @@ typedef struct NvmeNamespace {
> >  bool attached;
> >  uint8_t  csi;
> >  
> > +NvmeIdNsZoned   *id_ns_zoned;
> > +NvmeZone*zone_array;
> > +QTAILQ_HEAD(, NvmeZone) exp_open_zones;
> > +QTAILQ_HEAD(, NvmeZone) imp_open_zones;
> > +QTAILQ_HEAD(, NvmeZone) closed_zones;
> > +QTAILQ_HEAD(, NvmeZone) full_zones;
> > +uint32_tnum_zones;
> > +uint64_tzone_size;
> > +uint64_tzone_capacity;
> > +uint64_tzone_array_size;
> > +uint32_tzone_size_log2;
> > +
> >  NvmeNamespaceParams params;
> >  } NvmeNamespace;
> >  
> > @@ -71,8 +94,39 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, 
> > uint64_t lba)
> >  
> >  typedef struct NvmeCtrl NvmeCtrl;
> >  
> > +static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
> 
> This can (should?) return the NvmeZoneState enum.

Ok, good idea.

> 
> > +{
> > +return zone->d.zs >> 4;
> > +}
> > +
> > +static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState 
> > state)
> > +{
> > +zone->d.zs = state << 4;
> > +}
> 

[PATCH v10 10/12] hw/block/nvme: Support Zone Descriptor Extensions

2020-11-06 Thread Dmitry Fomichev
Zone Descriptor Extension is a label that can be assigned to a zone.
It can be set to an Empty zone and it stays assigned until the zone
is reset.

This commit adds a new optional module property,
"zoned.descr_ext_size". Its value must be a multiple of 64 bytes.
If this value is non-zero, it becomes possible to assign extensions
of that size to any Empty zones. The default value for this property
is 0, therefore setting extensions is disabled by default.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  8 +++
 hw/block/nvme-ns.c| 25 ++--
 hw/block/nvme.c   | 54 +--
 hw/block/trace-events |  2 ++
 4 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 421bab0a57..50a6a0e1ac 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -35,6 +35,7 @@ typedef struct NvmeNamespaceParams {
 uint64_t zone_cap_bs;
 uint32_t max_active_zones;
 uint32_t max_open_zones;
+uint32_t zd_extension_size;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -58,6 +59,7 @@ typedef struct NvmeNamespace {
 uint64_tzone_capacity;
 uint64_tzone_array_size;
 uint32_tzone_size_log2;
+uint8_t *zd_extensions;
 int32_t nr_open_zones;
 int32_t nr_active_zones;
 
@@ -127,6 +129,12 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
+ uint32_t zone_idx)
+{
+    return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size];
+}
+
 static inline void nvme_aor_inc_open(NvmeNamespace *ns)
 {
 assert(ns->nr_open_zones >= 0);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 2e45838c15..85dc73cf06 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -133,6 +133,18 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 return -1;
 }
 
+if (ns->params.zd_extension_size) {
+if (ns->params.zd_extension_size & 0x3f) {
+error_setg(errp,
+"zone descriptor extension size must be a multiple of 64B");
+return -1;
+}
+if ((ns->params.zd_extension_size >> 6) > 0xff) {
+error_setg(errp, "zone descriptor extension size is too large");
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -144,6 +156,10 @@ static void nvme_init_zone_state(NvmeNamespace *ns)
 int i;
 
 ns->zone_array = g_malloc0(ns->zone_array_size);
+if (ns->params.zd_extension_size) {
+ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
+  ns->num_zones);
+}
 
 QTAILQ_INIT(>exp_open_zones);
 QTAILQ_INIT(>imp_open_zones);
@@ -186,7 +202,8 @@ static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index,
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
 id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
-id_ns_z->lbafe[lba_index].zdes = 0;
+id_ns_z->lbafe[lba_index].zdes =
+ns->params.zd_extension_size >> 6; /* Units of 64B */
 
 ns->csi = NVME_CSI_ZONED;
 ns->id_ns.nsze = cpu_to_le64(ns->zone_size * ns->num_zones);
@@ -204,7 +221,8 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 
 zone->w_ptr = zone->d.wp;
 state = nvme_get_zone_state(zone);
-if (zone->d.wp != zone->d.zslba) {
+if (zone->d.wp != zone->d.zslba ||
+(zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
 if (state != NVME_ZONE_STATE_CLOSED) {
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
@@ -301,6 +319,7 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
 if (ns->params.zoned) {
 g_free(ns->id_ns_zoned);
 g_free(ns->zone_array);
+g_free(ns->zd_extensions);
 }
 }
 
@@ -332,6 +351,8 @@ static Property nvme_ns_props[] = {
params.max_active_zones, 0),
 DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
params.max_open_zones, 0),
+DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
+   params.zd_extension_size, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c6b3a5dcf7..e82e3be821 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1703,6 +1703,26 @@ static uint16_t nvme_offline_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 return NVME_ZONE_INVAL_TRANSITION;
 }
 
+static uint16_t nvme_set_zd_ext(NvmeNamespace *n

[PATCH v10 09/12] hw/block/nvme: Introduce max active and open zone limits

2020-11-06 Thread Dmitry Fomichev
Add two module properties, "zoned.max_active" and "zoned.max_open"
to control the maximum number of zones that can be active or open.
Once these variables are set to non-default values, these limits are
checked during I/O and Too Many Active or Too Many Open command status
is returned if they are exceeded.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h| 41 +++
 hw/block/nvme-ns.c| 30 +-
 hw/block/nvme.c   | 94 +++
 hw/block/trace-events |  2 +
 4 files changed, 165 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index d2631ff5a3..421bab0a57 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -33,6 +33,8 @@ typedef struct NvmeNamespaceParams {
 bool cross_zone_read;
 uint64_t zone_size_bs;
 uint64_t zone_cap_bs;
+uint32_t max_active_zones;
+uint32_t max_open_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -56,6 +58,8 @@ typedef struct NvmeNamespace {
 uint64_tzone_capacity;
 uint64_tzone_array_size;
 uint32_tzone_size_log2;
+int32_t nr_open_zones;
+int32_t nr_active_zones;
 
 NvmeNamespaceParams params;
 } NvmeNamespace;
@@ -123,6 +127,43 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline void nvme_aor_inc_open(NvmeNamespace *ns)
+{
+assert(ns->nr_open_zones >= 0);
+if (ns->params.max_open_zones) {
+ns->nr_open_zones++;
+assert(ns->nr_open_zones <= ns->params.max_open_zones);
+}
+}
+
+static inline void nvme_aor_dec_open(NvmeNamespace *ns)
+{
+if (ns->params.max_open_zones) {
+assert(ns->nr_open_zones > 0);
+ns->nr_open_zones--;
+}
+assert(ns->nr_open_zones >= 0);
+}
+
+static inline void nvme_aor_inc_active(NvmeNamespace *ns)
+{
+assert(ns->nr_active_zones >= 0);
+if (ns->params.max_active_zones) {
+ns->nr_active_zones++;
+assert(ns->nr_active_zones <= ns->params.max_active_zones);
+}
+}
+
+static inline void nvme_aor_dec_active(NvmeNamespace *ns)
+{
+if (ns->params.max_active_zones) {
+assert(ns->nr_active_zones > 0);
+ns->nr_active_zones--;
+assert(ns->nr_active_zones >= ns->nr_open_zones);
+}
+assert(ns->nr_active_zones >= 0);
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_flush(NvmeNamespace *ns);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index e6db7f7d3b..2e45838c15 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -119,6 +119,20 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 ns->zone_size_log2 = 63 - clz64(ns->zone_size);
 }
 
+/* Make sure that the values of all ZNS properties are sane */
+if (ns->params.max_open_zones > nz) {
+error_setg(errp,
+   "max_open_zones value %u exceeds the number of zones %u",
+   ns->params.max_open_zones, nz);
+return -1;
+}
+if (ns->params.max_active_zones > nz) {
+error_setg(errp,
+   "max_active_zones value %u exceeds the number of zones %u",
+   ns->params.max_active_zones, nz);
+return -1;
+}
+
 return 0;
 }
 
@@ -166,8 +180,8 @@ static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index,
 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
 
 /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
-id_ns_z->mar = 0xffffffff;
-id_ns_z->mor = 0xffffffff;
+id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
+id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
 id_ns_z->zoc = 0;
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
@@ -195,6 +209,7 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
 }
+nvme_aor_inc_active(ns);
 QTAILQ_INSERT_HEAD(>closed_zones, zone, entry);
 } else {
 trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
@@ -211,16 +226,23 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
 
 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
+nvme_aor_dec_active(ns);
 nvme_clear_zone(ns, zone);
 }
 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
+nvme_aor_dec_open(ns);
+nvme_aor_dec_ac

[PATCH v10 08/12] hw/block/nvme: Support Zoned Namespace Command Set

2020-11-06 Thread Dmitry Fomichev
The emulation code has been changed to advertise NVM Command Set when
"zoned" device property is not set (default) and Zoned Namespace
Command Set otherwise.

Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
Define trace events where needed in newly introduced code.

In order to improve scalability, all open, closed and full zones
are organized in separate linked lists. Consequently, almost all
zone operations don't require scanning of the entire zone array
(which potentially can be quite large) - it is only necessary to
enumerate one or more zone lists.

Handlers for three new NVMe commands introduced in Zoned Namespace
Command Set specification are added, namely for Zone Management
Receive, Zone Management Send and Zone Append.

Device initialization code has been extended to create a proper
configuration for zoned operation using device properties.

Read/Write command handler is modified to only allow writes at the
write pointer if the namespace is zoned. For Zone Append command,
writes implicitly happen at the write pointer and the starting write
pointer value is returned as the result of the command. Write Zeroes
handler is modified to add zoned checks that are identical to those
done as a part of Write flow.

Subsequent commits in this series add ZDE support and checks for
active and open zone limits.

Signed-off-by: Niklas Cassel 
Signed-off-by: Hans Holmberg 
Signed-off-by: Ajay Joshi 
Signed-off-by: Chaitanya Kulkarni 
Signed-off-by: Matias Bjorling 
Signed-off-by: Aravind Ramesh 
Signed-off-by: Shin'ichiro Kawasaki 
Signed-off-by: Adam Manzanares 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  54 +++
 hw/block/nvme.h   |   8 +
 hw/block/nvme-ns.c| 173 
 hw/block/nvme.c   | 972 +-
 hw/block/trace-events |  18 +-
 5 files changed, 1210 insertions(+), 15 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 2d9cd29d07..d2631ff5a3 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,9 +19,20 @@
 #define NVME_NS(obj) \
 OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
 
+typedef struct NvmeZone {
+NvmeZoneDescr   d;
+uint64_tw_ptr;
+QTAILQ_ENTRY(NvmeZone) entry;
+} NvmeZone;
+
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
 QemuUUID uuid;
+
+bool zoned;
+bool cross_zone_read;
+uint64_t zone_size_bs;
+uint64_t zone_cap_bs;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -34,6 +45,18 @@ typedef struct NvmeNamespace {
 bool attached;
 uint8_t  csi;
 
+NvmeIdNsZoned   *id_ns_zoned;
+NvmeZone*zone_array;
+QTAILQ_HEAD(, NvmeZone) exp_open_zones;
+QTAILQ_HEAD(, NvmeZone) imp_open_zones;
+QTAILQ_HEAD(, NvmeZone) closed_zones;
+QTAILQ_HEAD(, NvmeZone) full_zones;
+uint32_tnum_zones;
+uint64_tzone_size;
+uint64_tzone_capacity;
+uint64_tzone_array_size;
+uint32_tzone_size_log2;
+
 NvmeNamespaceParams params;
 } NvmeNamespace;
 
@@ -71,8 +94,39 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t 
lba)
 
 typedef struct NvmeCtrl NvmeCtrl;
 
+static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
+{
+return zone->d.zs >> 4;
+}
+
+static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState 
state)
+{
+zone->d.zs = state << 4;
+}
+
+static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
+{
+return zone->d.zslba + ns->zone_size;
+}
+
+static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
+{
+return zone->d.zslba + zone->d.zcap;
+}
+
+static inline bool nvme_wp_is_valid(NvmeZone *zone)
+{
+uint8_t st = nvme_get_zone_state(zone);
+
+return st != NVME_ZONE_STATE_FULL &&
+   st != NVME_ZONE_STATE_READ_ONLY &&
+   st != NVME_ZONE_STATE_OFFLINE;
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_flush(NvmeNamespace *ns);
+void nvme_ns_shutdown(NvmeNamespace *ns);
+void nvme_ns_cleanup(NvmeNamespace *ns);
 
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a..4cb0615128 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -6,6 +6,9 @@
 
 #define NVME_MAX_NAMESPACES 256
 
+#define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
+#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+
 typedef struct NvmeParams {
 char *serial;
 uint32_t num_queues; /* deprecated since 5.1 */
@@ -16,6 +19,7 @@ typedef struct NvmeParams {
 uint32_t aer_max_queued;
 uint8_t  mdts;
 bool use_intel_id;
+uint32_t zasl_bs;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -28,6 +32,8 @@ typedef struct NvmeRequest {
 struct NvmeNamespace*ns;
 BlockAIOCB 

[PATCH v10 07/12] block/nvme: Make ZNS-related definitions

2020-11-06 Thread Dmitry Fomichev
Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053).

Signed-off-by: Dmitry Fomichev 
---
 include/block/nvme.h | 114 ++-
 1 file changed, 113 insertions(+), 1 deletion(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 394db19022..752623b4f9 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -489,6 +489,9 @@ enum NvmeIoCommands {
 NVME_CMD_COMPARE= 0x05,
 NVME_CMD_WRITE_ZEROES   = 0x08,
 NVME_CMD_DSM= 0x09,
+NVME_CMD_ZONE_MGMT_SEND = 0x79,
+NVME_CMD_ZONE_MGMT_RECV = 0x7a,
+NVME_CMD_ZONE_APPEND= 0x7d,
 };
 
 typedef struct QEMU_PACKED NvmeDeleteQ {
@@ -648,9 +651,13 @@ typedef struct QEMU_PACKED NvmeAerResult {
 uint8_t resv;
 } NvmeAerResult;
 
+typedef struct QEMU_PACKED NvmeZonedResult {
+uint64_t slba;
+} NvmeZonedResult;
+
 typedef struct QEMU_PACKED NvmeCqe {
 uint32_tresult;
-uint32_trsvd;
+uint32_tdw1;
 uint16_tsq_head;
 uint16_tsq_id;
 uint16_tcid;
@@ -679,6 +686,7 @@ enum NvmeStatusCodes {
 NVME_INVALID_USE_OF_CMB = 0x0012,
 NVME_INVALID_PRP_OFFSET = 0x0013,
 NVME_CMD_SET_CMB_REJECTED   = 0x002b,
+NVME_INVALID_CMD_SET= 0x002c,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -703,6 +711,14 @@ enum NvmeStatusCodes {
 NVME_CONFLICTING_ATTRS  = 0x0180,
 NVME_INVALID_PROT_INFO  = 0x0181,
 NVME_WRITE_TO_RO= 0x0182,
+NVME_ZONE_BOUNDARY_ERROR= 0x01b8,
+NVME_ZONE_FULL  = 0x01b9,
+NVME_ZONE_READ_ONLY = 0x01ba,
+NVME_ZONE_OFFLINE   = 0x01bb,
+NVME_ZONE_INVALID_WRITE = 0x01bc,
+NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
+NVME_ZONE_TOO_MANY_OPEN = 0x01be,
+NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
 NVME_WRITE_FAULT= 0x0280,
 NVME_UNRECOVERED_READ   = 0x0281,
 NVME_E2E_GUARD_ERROR= 0x0282,
@@ -887,6 +903,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
 uint8_t vs[1024];
 } NvmeIdCtrl;
 
+typedef struct NvmeIdCtrlZoned {
+uint8_t zasl;
+uint8_t rsvd1[4095];
+} NvmeIdCtrlZoned;
+
 enum NvmeIdCtrlOacs {
 NVME_OACS_SECURITY  = 1 << 0,
 NVME_OACS_FORMAT= 1 << 1,
@@ -1012,6 +1033,12 @@ typedef struct QEMU_PACKED NvmeLBAF {
 uint8_t rp;
 } NvmeLBAF;
 
+typedef struct QEMU_PACKED NvmeLBAFE {
+uint64_tzsze;
+uint8_t zdes;
+uint8_t rsvd9[7];
+} NvmeLBAFE;
+
 #define NVME_NSID_BROADCAST 0x
 
 typedef struct QEMU_PACKED NvmeIdNs {
@@ -1066,10 +1093,24 @@ enum NvmeNsIdentifierType {
 
 enum NvmeCsi {
 NVME_CSI_NVM= 0x00,
+NVME_CSI_ZONED  = 0x02,
 };
 
 #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
 
+typedef struct QEMU_PACKED NvmeIdNsZoned {
+uint16_tzoc;
+uint16_tozcs;
+uint32_tmar;
+uint32_tmor;
+uint32_trrl;
+uint32_tfrl;
+uint8_t rsvd20[2796];
+NvmeLBAFE   lbafe[16];
+uint8_t rsvd3072[768];
+uint8_t vs[256];
+} NvmeIdNsZoned;
+
 /*Deallocate Logical Block Features*/
 #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)   ((dlfeat) & 0x10)
 #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)((dlfeat) & 0x08)
@@ -1101,10 +1142,76 @@ enum NvmeIdNsDps {
 DPS_FIRST_EIGHT = 8,
 };
 
+enum NvmeZoneAttr {
+NVME_ZA_FINISHED_BY_CTLR = 1 << 0,
+NVME_ZA_FINISH_RECOMMENDED   = 1 << 1,
+NVME_ZA_RESET_RECOMMENDED= 1 << 2,
+NVME_ZA_ZD_EXT_VALID = 1 << 7,
+};
+
+typedef struct QEMU_PACKED NvmeZoneReportHeader {
+uint64_tnr_zones;
+uint8_t rsvd[56];
+} NvmeZoneReportHeader;
+
+enum NvmeZoneReceiveAction {
+NVME_ZONE_REPORT = 0,
+NVME_ZONE_REPORT_EXTENDED= 1,
+};
+
+enum NvmeZoneReportType {
+NVME_ZONE_REPORT_ALL = 0,
+NVME_ZONE_REPORT_EMPTY   = 1,
+NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
+NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
+NVME_ZONE_REPORT_CLOSED  = 4,
+NVME_ZONE_REPORT_FULL= 5,
+NVME_ZONE_REPORT_READ_ONLY   = 6,
+NVME_ZONE_REPORT_OFFLINE = 7,
+};
+
+enum NvmeZoneType {
+NVME_ZONE_TYPE_RESERVED  = 0x00,
+NVME_ZONE_TYPE_SEQ_WRITE = 0x02,
+};
+
+enum NvmeZoneSendAction {
+NVME_ZONE_ACTION_RSD = 0x00,
+NVME_ZONE_ACTION_CLOSE   = 0x01,
+NVME_ZONE_ACTION_FINISH  = 0x02,
+NVME_ZONE_ACTION_OPEN= 0x03,
+NVME_ZONE_ACTION_RESET   = 0x04,
+NVME_ZONE_ACTION_OFFLINE = 0x05,
+NVME_ZONE_ACTION_SET_ZD_EXT  = 0x10,
+};
+
+typedef struct QEMU_PACKED NvmeZoneDescr {
+uint8_t zt;
+uint8_t zs;
+uint8_t za

[PATCH v10 05/12] hw/block/nvme: Add support for Namespace Types

2020-11-06 Thread Dmitry Fomichev
From: Niklas Cassel 

Define the structures and constants required to implement
Namespace Types support.

Namespace Types introduce a new command set, "I/O Command Sets",
that allows the host to retrieve the command sets associated with
a namespace. Introduce support for the command set and enable
detection for the NVM Command Set.

The new workflows for identify commands rely heavily on zero-filled
identify structs. E.g., certain CNS commands are defined to return
a zero-filled identify struct when an inactive namespace NSID
is supplied.

Add a helper function in order to avoid code duplication when
reporting zero-filled identify structures.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
---
 hw/block/nvme-ns.h|   1 +
 include/block/nvme.h  |  66 +++
 hw/block/nvme-ns.c|   2 +
 hw/block/nvme.c   | 188 +++---
 hw/block/trace-events |   7 ++
 5 files changed, 219 insertions(+), 45 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index a38071884a..d795e44bab 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
 int64_t  size;
 NvmeIdNs id_ns;
 const uint32_t *iocs;
+uint8_t  csi;
 
 NvmeNamespaceParams params;
 } NvmeNamespace;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index f62cc90d49..af23514713 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -84,6 +84,7 @@ enum NvmeCapMask {
 
 enum NvmeCapCss {
 NVME_CAP_CSS_NVM= 1 << 0,
+NVME_CAP_CSS_CSI_SUPP   = 1 << 6,
 NVME_CAP_CSS_ADMIN_ONLY = 1 << 7,
 };
 
@@ -117,9 +118,25 @@ enum NvmeCcMask {
 
 enum NvmeCcCss {
 NVME_CC_CSS_NVM= 0x0,
+NVME_CC_CSS_CSI= 0x6,
 NVME_CC_CSS_ADMIN_ONLY = 0x7,
 };
 
+#define NVME_SET_CC_EN(cc, val) \
+(cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT)
+#define NVME_SET_CC_CSS(cc, val)\
+(cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT)
+#define NVME_SET_CC_MPS(cc, val)\
+(cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT)
+#define NVME_SET_CC_AMS(cc, val)\
+(cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT)
+#define NVME_SET_CC_SHN(cc, val)\
+(cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT)
+#define NVME_SET_CC_IOSQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
 enum NvmeCstsShift {
 CSTS_RDY_SHIFT  = 0,
 CSTS_CFS_SHIFT  = 1,
@@ -534,8 +551,13 @@ typedef struct QEMU_PACKED NvmeIdentify {
 uint64_trsvd2[2];
 uint64_tprp1;
 uint64_tprp2;
-uint32_tcns;
-uint32_trsvd11[5];
+uint8_t cns;
+uint8_t rsvd10;
+uint16_tctrlid;
+uint16_tnvmsetid;
+uint8_t rsvd11;
+uint8_t csi;
+uint32_trsvd12[4];
 } NvmeIdentify;
 
 typedef struct QEMU_PACKED NvmeRwCmd {
@@ -656,6 +678,7 @@ enum NvmeStatusCodes {
 NVME_SGL_DESCR_TYPE_INVALID = 0x0011,
 NVME_INVALID_USE_OF_CMB = 0x0012,
 NVME_INVALID_PRP_OFFSET = 0x0013,
+NVME_CMD_SET_CMB_REJECTED   = 0x002b,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -782,11 +805,15 @@ typedef struct QEMU_PACKED NvmePSD {
 
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
-enum {
-NVME_ID_CNS_NS = 0x0,
-NVME_ID_CNS_CTRL   = 0x1,
-NVME_ID_CNS_NS_ACTIVE_LIST = 0x2,
-NVME_ID_CNS_NS_DESCR_LIST  = 0x3,
+enum NvmeIdCns {
+NVME_ID_CNS_NS= 0x00,
+NVME_ID_CNS_CTRL  = 0x01,
+NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
+NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+NVME_ID_CNS_CS_NS = 0x05,
+NVME_ID_CNS_CS_CTRL   = 0x06,
+NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
@@ -934,6 +961,7 @@ enum NvmeFeatureIds {
 NVME_WRITE_ATOMICITY= 0xa,
 NVME_ASYNCHRONOUS_EVENT_CONF= 0xb,
 NVME_TIMESTAMP  = 0xe,
+NVME_COMMAND_SET_PROFILE= 0x19,
 NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
 NVME_FID_MAX= 0x100,
 };
@@ -1018,18 +1046,26 @@ typedef struct QEMU_PACKED NvmeIdNsDescr {
 uint8_t rsvd2[2];
 } NvmeIdNsDescr;
 
-enum {
-NVME_NIDT_EUI64_LEN =  8,
-NVME_NIDT_NGUID_LEN = 16,
-NVME_NIDT_UUID_LEN  = 16,
+enum NvmeNsIdentifierLength {
+NVME_NIDL_EUI64 = 8,
+NVME_NIDL_NGUID = 16,
+NVME_NIDL_UUID  = 16,
+NVME_NIDL_CSI   = 1,
 };
 
 enum NvmeNsIdentifierType {
-NVME_NIDT_EUI64 = 0x1,
-NVME_NIDT_NGUID = 0x2,
-NVME_

[PATCH v10 04/12] hw/block/nvme: Merge nvme_write_zeroes() with nvme_write()

2020-11-06 Thread Dmitry Fomichev
nvme_write() now handles WRITE, WRITE ZEROES and ZONE_APPEND.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Acked-by: Klaus Jensen 
---
 hw/block/nvme.c   | 72 +--
 hw/block/trace-events |  1 -
 2 files changed, 28 insertions(+), 45 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 770e42a066..48adbe84f5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1001,32 +1001,7 @@ invalid:
 return status | NVME_DNR;
 }
 
-static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
-{
-NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
-NvmeNamespace *ns = req->ns;
-uint64_t slba = le64_to_cpu(rw->slba);
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
-uint64_t offset = nvme_l2b(ns, slba);
-uint32_t count = nvme_l2b(ns, nlb);
-uint16_t status;
-
-trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
-
-status = nvme_check_bounds(n, ns, slba, nlb);
-if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
-return status;
-}
-
-block_acct_start(blk_get_stats(req->ns->blkconf.blk), >acct, 0,
- BLOCK_ACCT_WRITE);
-req->aiocb = blk_aio_pwrite_zeroes(req->ns->blkconf.blk, offset, count,
-   BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
-return NVME_NO_COMPLETE;
-}
-
-static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req, bool wrz)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
 NvmeNamespace *ns = req->ns;
@@ -1040,10 +1015,12 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
  nvme_nsid(ns), nlb, data_size, slba);
 
-status = nvme_check_mdts(n, data_size);
-if (status) {
-trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
-goto invalid;
+if (!wrz) {
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
 }
 
 status = nvme_check_bounds(n, ns, slba, nlb);
@@ -1052,21 +1029,28 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req)
 goto invalid;
 }
 
-status = nvme_map_dptr(n, data_size, req);
-if (status) {
-goto invalid;
-}
-
 data_offset = nvme_l2b(ns, slba);
 
-block_acct_start(blk_get_stats(blk), >acct, data_size,
- BLOCK_ACCT_WRITE);
-if (req->qsg.sg) {
-req->aiocb = dma_blk_write(blk, >qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+if (!wrz) {
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_WRITE);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_write(blk, >qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
+ nvme_rw_cb, req);
+}
 } else {
-req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
- nvme_rw_cb, req);
+block_acct_start(blk_get_stats(blk), >acct, 0, BLOCK_ACCT_WRITE);
+req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
+   BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
+   req);
 }
 return NVME_NO_COMPLETE;
 
@@ -1100,9 +1084,9 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 case NVME_CMD_FLUSH:
 return nvme_flush(n, req);
 case NVME_CMD_WRITE_ZEROES:
-return nvme_write_zeroes(n, req);
+return nvme_write(n, req, true);
 case NVME_CMD_WRITE:
-return nvme_write(n, req);
+return nvme_write(n, req, false);
 case NVME_CMD_READ:
 return nvme_read(n, req);
 default:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 540c600931..e67e96c2b5 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -43,7 +43,6 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t 
opcode, const char *opna
 pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, 
uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 
0x%"PRIx64""
 pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, 
uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb 
%"PRIu32" count %"PRIu64" lba 0x%"P

[PATCH v10 02/12] hw/block/nvme: Generate namespace UUIDs

2020-11-06 Thread Dmitry Fomichev
In NVMe 1.4, a namespace must report an ID descriptor of UUID type
if it doesn't support EUI64 or NGUID. Add a new namespace property,
"uuid", that provides the user the option to either specify the UUID
explicitly or have a UUID generated automatically every time a
namespace is initialized.

Suggested-by: Klaus Jensen 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h | 1 +
 hw/block/nvme-ns.c | 1 +
 hw/block/nvme.c| 9 +
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index ea8c2f785d..a38071884a 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -21,6 +21,7 @@
 
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
+QemuUUID uuid;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index b69cdaf27e..de735eb9f3 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -129,6 +129,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 static Property nvme_ns_props[] = {
 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
+DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 702f7cc2e3..ed3f38f01d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1564,6 +1564,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
+NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
 uint32_t nsid = le32_to_cpu(c->nsid);
 uint8_t list[NVME_IDENTIFY_DATA_SIZE];
@@ -1583,7 +1584,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_NSID | NVME_DNR;
 }
 
-if (unlikely(!nvme_ns(n, nsid))) {
+ns = nvme_ns(n, nsid);
+if (unlikely(!ns)) {
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
@@ -1592,12 +1594,11 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl 
*n, NvmeRequest *req)
 /*
  * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
  * structure, a Namespace UUID (nidt = 0x3) must be reported in the
- * Namespace Identification Descriptor. Add a very basic Namespace UUID
- * here.
+ * Namespace Identification Descriptor. Add the namespace UUID here.
  */
 ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
 ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
-stl_be_p(_descrs->uuid.v, nsid);
+memcpy(_descrs->uuid.v, ns->params.uuid.data, NVME_NIDT_UUID_LEN);
 
 return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
 DMA_DIRECTION_FROM_DEVICE, req);
-- 
2.21.0




[PATCH v10 11/12] hw/block/nvme: Add injection of Offline/Read-Only zones

2020-11-06 Thread Dmitry Fomichev
ZNS specification defines two zone conditions for the zones that no
longer can function properly, possibly because of flash wear or other
internal fault. It is useful to be able to "inject" a small number of
such zones for testing purposes.

This commit defines two optional device properties, "offline_zones"
and "rdonly_zones". Users can assign non-zero values to these variables
to specify the number of zones to be initialized as Offline or
Read-Only. The actual number of injected zones may be smaller than the
requested amount - Read-Only and Offline counts are expected to be much
smaller than the total number of zones on a drive.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h |  2 ++
 hw/block/nvme-ns.c | 52 ++
 2 files changed, 54 insertions(+)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 50a6a0e1ac..b30478e5d7 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -36,6 +36,8 @@ typedef struct NvmeNamespaceParams {
 uint32_t max_active_zones;
 uint32_t max_open_zones;
 uint32_t zd_extension_size;
+uint32_t nr_offline_zones;
+uint32_t nr_rdonly_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 85dc73cf06..5e4a6705cd 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -21,6 +21,7 @@
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
+#include "crypto/random.h"
 
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
@@ -145,6 +146,20 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 }
 }
 
+if (ns->params.max_open_zones < nz) {
+if (ns->params.nr_offline_zones > nz - ns->params.max_open_zones) {
+error_setg(errp, "offline_zones value %u is too large",
+ns->params.nr_offline_zones);
+return -1;
+}
+if (ns->params.nr_rdonly_zones >
+nz - ns->params.max_open_zones - ns->params.nr_offline_zones) {
+error_setg(errp, "rdonly_zones value %u is too large",
+ns->params.nr_rdonly_zones);
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -153,7 +168,9 @@ static void nvme_init_zone_state(NvmeNamespace *ns)
 uint64_t start = 0, zone_size = ns->zone_size;
 uint64_t capacity = ns->num_zones * zone_size;
 NvmeZone *zone;
+uint32_t rnd;
 int i;
+uint16_t zs;
 
 ns->zone_array = g_malloc0(ns->zone_array_size);
 if (ns->params.zd_extension_size) {
@@ -180,6 +197,37 @@ static void nvme_init_zone_state(NvmeNamespace *ns)
 zone->w_ptr = start;
 start += zone_size;
 }
+
+/* If required, make some zones Offline or Read Only */
+
+for (i = 0; i < ns->params.nr_offline_zones; i++) {
+do {
+qcrypto_random_bytes(, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = >zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_OFFLINE);
+} else {
+i--;
+}
+}
+
+for (i = 0; i < ns->params.nr_rdonly_zones; i++) {
+do {
+qcrypto_random_bytes(, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = >zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE &&
+zs != NVME_ZONE_STATE_READ_ONLY) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_READ_ONLY);
+} else {
+i--;
+}
+}
 }
 
 static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index,
@@ -353,6 +401,10 @@ static Property nvme_ns_props[] = {
params.max_open_zones, 0),
 DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
params.zd_extension_size, 0),
+DEFINE_PROP_UINT32("zoned.offline_zones", NvmeNamespace,
+   params.nr_offline_zones, 0),
+DEFINE_PROP_UINT32("zoned.rdonly_zones", NvmeNamespace,
+   params.nr_rdonly_zones, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.21.0




[PATCH v10 01/12] hw/block/nvme: Add Commands Supported and Effects log

2020-11-06 Thread Dmitry Fomichev
This log page becomes necessary to implement to allow checking for
Zone Append command support in Zoned Namespace Command Set.

This commit adds the code to report this log page for NVM Command
Set only. The parts that are specific to zoned operation will be
added later in the series.

All incoming admin and i/o commands are now only processed if their
corresponding support bits are set in this log. This provides an
easy way to control what commands to support and what not to
depending on set CC.CSS.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  1 +
 include/block/nvme.h  | 19 +
 hw/block/nvme.c   | 96 +++
 hw/block/trace-events |  1 +
 4 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 83734f4606..ea8c2f785d 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -29,6 +29,7 @@ typedef struct NvmeNamespace {
 int32_t  bootindex;
 int64_t  size;
 NvmeIdNs id_ns;
+const uint32_t *iocs;
 
 NvmeNamespaceParams params;
 } NvmeNamespace;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 8a46d9cf01..f62cc90d49 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -745,10 +745,27 @@ enum NvmeSmartWarn {
 NVME_SMART_FAILED_VOLATILE_MEDIA  = 1 << 4,
 };
 
+typedef struct NvmeEffectsLog {
+uint32_tacs[256];
+uint32_tiocs[256];
+uint8_t resv[2048];
+} NvmeEffectsLog;
+
+enum {
+NVME_CMD_EFF_CSUPP  = 1 << 0,
+NVME_CMD_EFF_LBCC   = 1 << 1,
+NVME_CMD_EFF_NCC= 1 << 2,
+NVME_CMD_EFF_NIC= 1 << 3,
+NVME_CMD_EFF_CCC= 1 << 4,
+NVME_CMD_EFF_CSE_MASK   = 3 << 16,
+NVME_CMD_EFF_UUID_SEL   = 1 << 19,
+};
+
 enum NvmeLogIdentifier {
 NVME_LOG_ERROR_INFO = 0x01,
 NVME_LOG_SMART_INFO = 0x02,
 NVME_LOG_FW_SLOT_INFO   = 0x03,
+NVME_LOG_CMD_EFFECTS= 0x05,
 };
 
 typedef struct QEMU_PACKED NvmePSD {
@@ -861,6 +878,7 @@ enum NvmeIdCtrlFrmw {
 
 enum NvmeIdCtrlLpa {
 NVME_LPA_NS_SMART = 1 << 0,
+NVME_LPA_CSE  = 1 << 1,
 NVME_LPA_EXTENDED = 1 << 2,
 };
 
@@ -1060,6 +1078,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
+QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 065e763e4f..702f7cc2e3 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -111,6 +111,28 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
 };
 
+static const uint32_t nvme_cse_acs[256] = {
+[NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_DELETE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ABORT]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
+};
+
+static const uint32_t nvme_cse_iocs_none[256];
+
+static const uint32_t nvme_cse_iocs_nvm[256] = {
+[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
+};
+
 static void nvme_process_sq(void *opaque);
 
 static uint16_t nvme_cid(NvmeRequest *req)
@@ -1022,10 +1044,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
   req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
 
-if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) {
-return NVME_INVALID_OPCODE | NVME_DNR;
-}
-
 if (!nvme_nsid_valid(n, nsid)) {
 return NVME_INVALID_NSID | NVME_DNR;
 }
@@ -1035,6 +1053,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
+if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
+trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
+return NVME_INVALID_OPCODE | NVME_DNR;
+}
+
 switch (req->cmd.opcode) {
 case NVME_CMD_FLUSH:
 return nvme_flush(n, re

[PATCH v10 12/12] hw/block/nvme: Document zoned parameters in usage text

2020-11-06 Thread Dmitry Fomichev
Added brief descriptions of the new device properties that are
now available to users to configure features of Zoned Namespace
Command Set in the emulator.

This patch is for documentation only, no functionality change.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme.c | 47 ++-
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e82e3be821..6043f95116 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -9,7 +9,7 @@
  */
 
 /**
- * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
+ * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
  *
  *  https://nvmexpress.org/developers/nvme-specification/
  */
@@ -22,8 +22,9 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
- *  mdts=
- *  -device nvme-ns,drive=,bus=bus_name,nsid=
+ *  mdts=,zoned.append_size_limit= \
+ *  -device nvme-ns,drive=,bus=,nsid=,\
+ *  zoned=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -41,14 +42,50 @@
  * ~~
  * - `aerl`
  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
- *   of concurrently outstanding Asynchronous Event Request commands suppoert
+ *   of concurrently outstanding Asynchronous Event Request commands supported
  *   by the controller. This is a 0's based value.
  *
  * - `aer_max_queued`
  *   This is the maximum number of events that the device will enqueue for
- *   completion when there are no oustanding AERs. When the maximum number of
+ *   completion when there are no outstanding AERs. When the maximum number of
  *   enqueued events are reached, subsequent events will be dropped.
  *
+ * - `zoned.append_size_limit`
+ *   The maximum I/O size in bytes that is allowed in Zone Append command.
+ *   The default is 128KiB. Since internally this value is maintained as
+ *   ZASL = log2(<maximum zone append size> / <page size>), some values assigned
+ *   to this property may be rounded down and result in a lower maximum ZA
+ *   data size being in effect. By setting this property to 0, users can make
+ *   ZASL equal to MDTS. This property only affects zoned namespaces.
+ *
+ * Setting `zoned` to true selects Zoned Command Set at the namespace.
+ * In this case, the following namespace properties are available to configure
+ * zoned operation:
+ * zoned.zsze=
+ * The number may be followed by K, M, G as in kilo-, mega- or giga-.
+ *
+ * zoned.zcap=
+ * The value 0 (default) forces zone capacity to be the same as zone
+ * size. The value of this property may not exceed zone size.
+ *
+ * zoned.descr_ext_size=
+ * This value needs to be specified in 64B units. If it is zero,
+ * namespace(s) will not support zone descriptor extensions.
+ *
+ * zoned.max_active=
+ * The default value means there is no limit to the number of
+ * concurrently active zones.
+ *
+ * zoned.max_open=
+ * The default value means there is no limit to the number of
+ * concurrently open zones.
+ *
+ * zoned.offline_zones=
+ *
+ * zoned.rdonly_zones=
+ *
+ * zoned.cross_zone_read=
+ * Setting this property to true enables Read Across Zone Boundaries.
  */
 
 #include "qemu/osdep.h"
-- 
2.21.0




[PATCH v10 06/12] hw/block/nvme: Support allocated CNS command variants

2020-11-06 Thread Dmitry Fomichev
From: Niklas Cassel 

Many CNS commands have "allocated" command variants. These include
a namespace as long as it is allocated, that is, a namespace is
included regardless of whether it is active (attached) or not.

While these commands are optional (they are mandatory for controllers
supporting the namespace attachment command), our QEMU implementation
is more complete by actually providing support for these CNS values.

However, since our QEMU model currently does not support the namespace
attachment command, these new allocated CNS commands will return the
same result as the active CNS command variants.

In NVMe, a namespace is active if it exists and is attached to the
controller.

Add a new Boolean namespace flag, "attached", to provide the most
basic namespace attachment support. The default value for this new
flag is true. Also, implement the logic in the new CNS values to
include/exclude namespaces based on this new property. The only thing
missing is hooking up the actual Namespace Attachment command opcode,
which will allow a user to toggle the "attached" flag per namespace.

The reason for not hooking up this command completely is because the
NVMe specification requires the namespace management command to be
supported if the namespace attachment command is supported.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
---
 hw/block/nvme-ns.h   |  1 +
 include/block/nvme.h | 20 +++
 hw/block/nvme-ns.c   |  1 +
 hw/block/nvme.c  | 46 +---
 4 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index d795e44bab..2d9cd29d07 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
 int64_t  size;
 NvmeIdNs id_ns;
 const uint32_t *iocs;
+bool attached;
 uint8_t  csi;
 
 NvmeNamespaceParams params;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index af23514713..394db19022 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -806,14 +806,18 @@ typedef struct QEMU_PACKED NvmePSD {
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
 enum NvmeIdCns {
-NVME_ID_CNS_NS= 0x00,
-NVME_ID_CNS_CTRL  = 0x01,
-NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
-NVME_ID_CNS_NS_DESCR_LIST = 0x03,
-NVME_ID_CNS_CS_NS = 0x05,
-NVME_ID_CNS_CS_CTRL   = 0x06,
-NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
-NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
+NVME_ID_CNS_NS= 0x00,
+NVME_ID_CNS_CTRL  = 0x01,
+NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
+NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+NVME_ID_CNS_CS_NS = 0x05,
+NVME_ID_CNS_CS_CTRL   = 0x06,
+NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+NVME_ID_CNS_NS_PRESENT_LIST   = 0x10,
+NVME_ID_CNS_NS_PRESENT= 0x11,
+NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
+NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
+NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index c0362426cc..e191ef9be0 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -42,6 +42,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
 id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
 
 ns->csi = NVME_CSI_NVM;
+ns->attached = true;
 
 /* no thin provisioning */
 id_ns->ncap = id_ns->nsze;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index bb82dd9975..7495cdb5ef 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1236,6 +1236,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, 
uint32_t buf_len,
 uint32_t trans_len;
 NvmeNamespace *ns;
 time_t current_ms;
+int i;
 
 if (off >= sizeof(smart)) {
 return NVME_INVALID_FIELD | NVME_DNR;
@@ -1246,10 +1247,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 if (!ns) {
 return NVME_INVALID_NSID | NVME_DNR;
 }
-nvme_set_blk_stats(ns, );
 } else {
-int i;
-
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
 if (!ns) {
@@ -1552,7 +1550,8 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req,
+ bool only_active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -1569,11 +1568,16 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_rpt_empty_id_struct(n, req);
 }
 
+if (only_active && !ns->attached) {
+return nvme_rpt_empty_i

[PATCH v10 00/12] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set

2020-11-06 Thread Dmitry Fomichev
ments addressed in v2 of the series
   (see below).

v2 -> v3:

 - Moved nvme_fill_data() function to the NSTypes patch as it is
   now used there to output empty namespace identify structs.
 - Fixed typo in Maxim's email address.

v1 -> v2:

 - Rebased on top of qemu-nvme/next branch.
 - Incorporated feedback from Klaus and Alistair.
* Allow a subset of CSE log to be read, not the entire log
* Assign admin command entries in CSE log to ACS fields
* Set LPA bit 1 to indicate support of CSE log page
* Rename CC.CSS value CSS_ALL_NSTYPES (110b) to CSS_CSI
* Move the code to assign lbaf.ds to a separate patch
* Remove the change in firmware revision
* Change "driver" to "device" in comments and annotations
* Rename ZAMDS to ZASL
* Correct a few format expressions and some wording in
  trace event definitions
* Remove validation code to return NVME_CAP_EXCEEDED error
* Make ZASL equal to MDTS if "zone_append_size_limit"
  module parameter is not set
* Clean up nvme_zoned_init_ctrl() to make size calculations
  less confusing
* Avoid changing module parameters, use separate n/s variables
  if additional calculations are necessary to convert parameters
  to running values
* Use NVME_DEFAULT_ZONE_SIZE to assign the default zone size value
* Use default 0 for zone capacity meaning that zone capacity will
  be equal to zone size by default
* Issue warnings if user MAR/MOR values are too large and have
  to be adjusted
* Use unsigned values for MAR/MOR
 - Dropped "Simulate Zone Active excursions" patch.
   Excursion behavior may depend on the internal controller
   architecture and therefore be vendor-specific.
 - Dropped support for Zone Attributes and zoned AENs for now.
   These features can be added in a future series.
 - NS Types support is extended to handle active/inactive namespaces.
 - Update the write pointer after backing storage I/O completion, not
   before. This makes the emulation to run correctly in case of
   backing device failures.
 - Avoid division in the I/O path if the device zone size is
   a power of two (the most common case). Zone index then can be
   calculated by using bit shift.
 - A few reported bugs have been fixed.
 - Indentation in function definitions has been changed to make it
   the same as the rest of the code.


Zoned Namespace (ZNS) Command Set is a newly introduced command set
published by the NVM Express, Inc. organization as TP 4053. The main
design goals of ZNS are to provide hardware designers the means to
reduce NVMe controller complexity and to allow achieving a better I/O
latency and throughput. SSDs that implement this interface are
commonly known as ZNS SSDs.

This command set is implementing a zoned storage model, similarly to
ZAC/ZBC. As such, there is already support in Linux, allowing one to
perform the majority of tasks needed for managing ZNS SSDs.

The Zoned Namespace Command Set relies on another TP, known as
Namespace Types (NVMe TP 4056), which introduces support for having
multiple command sets per namespace.

Both ZNS and Namespace Types specifications can be downloaded by
visiting the following link -

https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs.zip

This patch series adds Namespace Types support and zoned namespace
emulation capability to the existing NVMe PCI device.

Based-on: <20201104102248.32168-1-...@irrelevant.dk>

Dmitry Fomichev (10):
  hw/block/nvme: Add Commands Supported and Effects log
  hw/block/nvme: Generate namespace UUIDs
  hw/block/nvme: Separate read and write handlers
  hw/block/nvme: Merge nvme_write_zeroes() with nvme_write()
  block/nvme: Make ZNS-related definitions
  hw/block/nvme: Support Zoned Namespace Command Set
  hw/block/nvme: Introduce max active and open zone limits
  hw/block/nvme: Support Zone Descriptor Extensions
  hw/block/nvme: Add injection of Offline/Read-Only zones
  hw/block/nvme: Document zoned parameters in usage text

Niklas Cassel (2):
  hw/block/nvme: Add support for Namespace Types
  hw/block/nvme: Support allocated CNS command variants

 hw/block/nvme-ns.h|  109 +++
 hw/block/nvme.h   |8 +
 include/block/nvme.h  |  205 +-
 hw/block/nvme-ns.c|  276 +++
 hw/block/nvme.c   | 1591 ++---
 hw/block/trace-events |   32 +-
 6 files changed, 2096 insertions(+), 125 deletions(-)

-- 
2.21.0




[PATCH v10 03/12] hw/block/nvme: Separate read and write handlers

2020-11-06 Thread Dmitry Fomichev
With ZNS support in place, the majority of code in nvme_rw() has
become read- or write-specific. Move these parts to two separate
handlers, nvme_read() and nvme_write() to make the code more
readable and to remove multiple is_write checks that so far existed
in the i/o path.

This is a refactoring patch, no change in functionality.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Acked-by: Klaus Jensen 
---
 hw/block/nvme.c   | 91 ++-
 hw/block/trace-events |  3 +-
 2 files changed, 67 insertions(+), 27 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index ed3f38f01d..770e42a066 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -953,6 +953,54 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
+static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
+NvmeNamespace *ns = req->ns;
+uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+uint64_t data_size = nvme_l2b(ns, nlb);
+uint64_t data_offset;
+BlockBackend *blk = ns->blkconf.blk;
+uint16_t status;
+
+trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba);
+
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
+
+status = nvme_check_bounds(n, ns, slba, nlb);
+if (status) {
+trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+goto invalid;
+}
+
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_READ);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_read(blk, >qsg, data_offset,
+  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_preadv(blk, data_offset, >iov, 0,
+nvme_rw_cb, req);
+}
+return NVME_NO_COMPLETE;
+
+invalid:
+block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
+return status | NVME_DNR;
+}
+
 static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
@@ -978,22 +1026,19 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
 NvmeNamespace *ns = req->ns;
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
-
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t data_size = nvme_l2b(ns, nlb);
-uint64_t data_offset = nvme_l2b(ns, slba);
-enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ?
-BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+uint64_t data_offset;
 BlockBackend *blk = ns->blkconf.blk;
 uint16_t status;
 
-trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode),
-  nvme_nsid(ns), nlb, data_size, slba);
+trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
+ nvme_nsid(ns), nlb, data_size, slba);
 
 status = nvme_check_mdts(n, data_size);
 if (status) {
@@ -1012,29 +1057,22 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 goto invalid;
 }
 
-block_acct_start(blk_get_stats(blk), >acct, data_size, acct);
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_WRITE);
 if (req->qsg.sg) {
-if (acct == BLOCK_ACCT_WRITE) {
-req->aiocb = dma_blk_write(blk, >qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-} else {
-req->aiocb = dma_blk_read(blk, >qsg, data_offset,
-  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-}
+req->aiocb = dma_blk_write(blk, >qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
 } else {
-if (acct == BLOCK_ACCT_WRITE) {
-req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
- nvme_rw_cb, req);
-} else {
-req->aiocb = blk_aio_preadv(blk, data_offset, >iov, 0,
-nvme_rw_cb, req);
-}
+req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
+ nvme_rw_cb, req);
 }
 return NVME_NO_COMPLETE;
 
 invalid:
-block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct)

RE: [PATCH v9 08/12] hw/block/nvme: Support Zoned Namespace Command Set

2020-11-06 Thread Dmitry Fomichev
> -Original Message-
> From: Niklas Cassel 
> Sent: Friday, November 6, 2020 6:59 AM
> To: Dmitry Fomichev 
> Cc: Keith Busch ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Max Reitz ;
> Maxim Levitsky ; Fam Zheng ;
> Alistair Francis ; Matias Bjorling
> ; Damien Le Moal ;
> qemu-bl...@nongnu.org; qemu-devel@nongnu.org
> Subject: Re: [PATCH v9 08/12] hw/block/nvme: Support Zoned Namespace
> Command Set
> 
> On Thu, Nov 05, 2020 at 11:53:38AM +0900, Dmitry Fomichev wrote:
> > The emulation code has been changed to advertise NVM Command Set
> when
> > "zoned" device property is not set (default) and Zoned Namespace
> > Command Set otherwise.
> >
> > Define values and structures that are needed to support Zoned
> > Namespace Command Set (NVMe TP 4053) in PCI NVMe controller
> emulator.
> > Define trace events where needed in newly introduced code.
> >
> > In order to improve scalability, all open, closed and full zones
> > are organized in separate linked lists. Consequently, almost all
> > zone operations don't require scanning of the entire zone array
> > (which potentially can be quite large) - it is only necessary to
> > enumerate one or more zone lists.
> >
> > Handlers for three new NVMe commands introduced in Zoned Namespace
> > Command Set specification are added, namely for Zone Management
> > Receive, Zone Management Send and Zone Append.
> >
> > Device initialization code has been extended to create a proper
> > configuration for zoned operation using device properties.
> >
> > Read/Write command handler is modified to only allow writes at the
> > write pointer if the namespace is zoned. For Zone Append command,
> > writes implicitly happen at the write pointer and the starting write
> > pointer value is returned as the result of the command. Write Zeroes
> > handler is modified to add zoned checks that are identical to those
> > done as a part of Write flow.
> >
> > Subsequent commits in this series add ZDE support and checks for
> > active and open zone limits.
> >
> > Signed-off-by: Niklas Cassel 
> > Signed-off-by: Hans Holmberg 
> > Signed-off-by: Ajay Joshi 
> > Signed-off-by: Chaitanya Kulkarni 
> > Signed-off-by: Matias Bjorling 
> > Signed-off-by: Aravind Ramesh 
> > Signed-off-by: Shin'ichiro Kawasaki 
> > Signed-off-by: Adam Manzanares 
> > Signed-off-by: Dmitry Fomichev 
> > Reviewed-by: Niklas Cassel 
> > ---
> >  hw/block/nvme-ns.h|  54 +++
> >  hw/block/nvme.h   |   8 +
> >  hw/block/nvme-ns.c| 173 
> >  hw/block/nvme.c   | 971
> +-
> >  hw/block/trace-events |  18 +-
> >  5 files changed, 1209 insertions(+), 15 deletions(-)
> >
> 
> (snip)
> 
> > +static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest
> *req)
> > +{
> > +NvmeCmd *cmd = (NvmeCmd *)>cmd;
> > +NvmeNamespace *ns = req->ns;
> > +/* cdw12 is zero-based number of dwords to return. Convert to bytes
> */
> > +uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
> > +uint32_t dw13 = le32_to_cpu(cmd->cdw13);
> > +uint32_t zone_idx, zra, zrasf, partial;
> > +uint64_t max_zones, nr_zones = 0;
> > +uint16_t ret;
> > +uint64_t slba;
> > +NvmeZoneDescr *z;
> > +NvmeZone *zs;
> > +NvmeZoneReportHeader *header;
> > +void *buf, *buf_p;
> > +size_t zone_entry_sz;
> > +
> > +req->status = NVME_SUCCESS;
> > +
> > +ret = nvme_get_mgmt_zone_slba_idx(ns, cmd, , _idx);
> > +if (ret) {
> > +return ret;
> > +}
> > +
> > +zra = dw13 & 0xff;
> > +if (zra != NVME_ZONE_REPORT) {
> > +return NVME_INVALID_FIELD | NVME_DNR;
> > +}
> > +
> > +zrasf = (dw13 >> 8) & 0xff;
> > +if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
> > +return NVME_INVALID_FIELD | NVME_DNR;
> > +}
> > +
> > +if (data_size < sizeof(NvmeZoneReportHeader)) {
> > +return NVME_INVALID_FIELD | NVME_DNR;
> > +}
> > +
> > +ret = nvme_map_dptr(n, data_size, req);
> 
> nvme_map_dptr() call was not here in v8 patch set.
> 
> On v7 I commented that you were missing a call to nvme_check_mdts().
> I think you still need to call nvme_check_mdts in order to verify
> that data_size < mdts, no?

Ugh, I've added nvme_map_dptr() instead of nvme_check_mdts() :o
Will send the corrected version shortly...

Cheers,
DF

&

[PATCH v9 12/12] hw/block/nvme: Document zoned parameters in usage text

2020-11-04 Thread Dmitry Fomichev
Added brief descriptions of the new device properties that are
now available to users to configure features of Zoned Namespace
Command Set in the emulator.

This patch is for documentation only, no functionality change.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme.c | 47 ++-
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 0db51995cc..8901321317 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -9,7 +9,7 @@
  */
 
 /**
- * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
+ * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
  *
  *  https://nvmexpress.org/developers/nvme-specification/
  */
@@ -22,8 +22,9 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
- *  mdts=
- *  -device nvme-ns,drive=,bus=bus_name,nsid=
+ *  mdts=,zoned.append_size_limit= \
+ *  -device nvme-ns,drive=,bus=,nsid=,\
+ *  zoned=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -41,14 +42,50 @@
  * ~~
  * - `aerl`
  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
- *   of concurrently outstanding Asynchronous Event Request commands suppoert
+ *   of concurrently outstanding Asynchronous Event Request commands support
  *   by the controller. This is a 0's based value.
  *
  * - `aer_max_queued`
  *   This is the maximum number of events that the device will enqueue for
- *   completion when there are no oustanding AERs. When the maximum number of
+ *   completion when there are no outstanding AERs. When the maximum number of
  *   enqueued events are reached, subsequent events will be dropped.
  *
+ * - `zoned.append_size_limit`
+ *   The maximum I/O size in bytes that is allowed in Zone Append command.
+ *   The default is 128KiB. Since internally this value is maintained as
+ *   ZASL = log2( / ), some values assigned
+ *   to this property may be rounded down and result in a lower maximum ZA
+ *   data size being in effect. By setting this property to 0, users can make
+ *   ZASL to be equal to MDTS. This property only affects zoned namespaces.
+ *
+ * Setting `zoned` to true selects Zoned Command Set at the namespace.
+ * In this case, the following namespace properties are available to configure
+ * zoned operation:
+ * zoned.zsze=
+ * The number may be followed by K, M, G as in kilo-, mega- or giga-.
+ *
+ * zoned.zcap=
+ * The value 0 (default) forces zone capacity to be the same as zone
+ * size. The value of this property may not exceed zone size.
+ *
+ * zoned.descr_ext_size=
+ * This value needs to be specified in 64B units. If it is zero,
+ * namespace(s) will not support zone descriptor extensions.
+ *
+ * zoned.max_active=
+ * The default value means there is no limit to the number of
+ * concurrently active zones.
+ *
+ * zoned.max_open=
+ * The default value means there is no limit to the number of
+ * concurrently open zones.
+ *
+ * zoned.offline_zones=
+ *
+ * zoned.rdonly_zones=
+ *
+ * zoned.cross_zone_read=
+ * Setting this property to true enables Read Across Zone Boundaries.
  */
 
 #include "qemu/osdep.h"
-- 
2.21.0




[PATCH v9 11/12] hw/block/nvme: Add injection of Offline/Read-Only zones

2020-11-04 Thread Dmitry Fomichev
The ZNS specification defines two zone conditions for zones that can no
longer function properly, possibly because of flash wear or other
internal fault. It is useful to be able to "inject" a small number of
such zones for testing purposes.

This commit defines two optional device properties, "offline_zones"
and "rdonly_zones". Users can assign non-zero values to these variables
to specify the number of zones to be initialized as Offline or
Read-Only. The actual number of injected zones may be smaller than the
requested amount - Read-Only and Offline counts are expected to be much
smaller than the total number of zones on a drive.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h |  2 ++
 hw/block/nvme-ns.c | 52 ++
 2 files changed, 54 insertions(+)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 50a6a0e1ac..b30478e5d7 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -36,6 +36,8 @@ typedef struct NvmeNamespaceParams {
 uint32_t max_active_zones;
 uint32_t max_open_zones;
 uint32_t zd_extension_size;
+uint32_t nr_offline_zones;
+uint32_t nr_rdonly_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 85dc73cf06..5e4a6705cd 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -21,6 +21,7 @@
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
+#include "crypto/random.h"
 
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
@@ -145,6 +146,20 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 }
 }
 
+if (ns->params.max_open_zones < nz) {
+if (ns->params.nr_offline_zones > nz - ns->params.max_open_zones) {
+error_setg(errp, "offline_zones value %u is too large",
+ns->params.nr_offline_zones);
+return -1;
+}
+if (ns->params.nr_rdonly_zones >
+nz - ns->params.max_open_zones - ns->params.nr_offline_zones) {
+error_setg(errp, "rdonly_zones value %u is too large",
+ns->params.nr_rdonly_zones);
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -153,7 +168,9 @@ static void nvme_init_zone_state(NvmeNamespace *ns)
 uint64_t start = 0, zone_size = ns->zone_size;
 uint64_t capacity = ns->num_zones * zone_size;
 NvmeZone *zone;
+uint32_t rnd;
 int i;
+uint16_t zs;
 
 ns->zone_array = g_malloc0(ns->zone_array_size);
 if (ns->params.zd_extension_size) {
@@ -180,6 +197,37 @@ static void nvme_init_zone_state(NvmeNamespace *ns)
 zone->w_ptr = start;
 start += zone_size;
 }
+
+/* If required, make some zones Offline or Read Only */
+
+for (i = 0; i < ns->params.nr_offline_zones; i++) {
+do {
+qcrypto_random_bytes(, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = >zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_OFFLINE);
+} else {
+i--;
+}
+}
+
+for (i = 0; i < ns->params.nr_rdonly_zones; i++) {
+do {
+qcrypto_random_bytes(, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = >zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE &&
+zs != NVME_ZONE_STATE_READ_ONLY) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_READ_ONLY);
+} else {
+i--;
+}
+}
 }
 
 static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index,
@@ -353,6 +401,10 @@ static Property nvme_ns_props[] = {
params.max_open_zones, 0),
 DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
params.zd_extension_size, 0),
+DEFINE_PROP_UINT32("zoned.offline_zones", NvmeNamespace,
+   params.nr_offline_zones, 0),
+DEFINE_PROP_UINT32("zoned.rdonly_zones", NvmeNamespace,
+   params.nr_rdonly_zones, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.21.0




[PATCH v9 06/12] hw/block/nvme: Support allocated CNS command variants

2020-11-04 Thread Dmitry Fomichev
From: Niklas Cassel 

Many CNS commands have "allocated" command variants. These include
a namespace as long as it is allocated; that is, a namespace is
included regardless of whether it is active (attached) or not.

While these commands are optional (they are mandatory for controllers
supporting the namespace attachment command), our QEMU implementation
is more complete by actually providing support for these CNS values.

However, since our QEMU model currently does not support the namespace
attachment command, these new allocated CNS commands will return the
same result as the active CNS command variants.

In NVMe, a namespace is active if it exists and is attached to the
controller.

Add a new Boolean namespace flag, "attached", to provide the most
basic namespace attachment support. The default value for this new
flag is true. Also, implement the logic in the new CNS values to
include/exclude namespaces based on this new property. The only thing
missing is hooking up the actual Namespace Attachment command opcode,
which will allow a user to toggle the "attached" flag per namespace.

The reason for not hooking up this command completely is because the
NVMe specification requires the namespace management command to be
supported if the namespace attachment command is supported.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
---
 hw/block/nvme-ns.h   |  1 +
 include/block/nvme.h | 20 +++
 hw/block/nvme-ns.c   |  1 +
 hw/block/nvme.c  | 46 +---
 4 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index d795e44bab..2d9cd29d07 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
 int64_t  size;
 NvmeIdNs id_ns;
 const uint32_t *iocs;
+bool attached;
 uint8_t  csi;
 
 NvmeNamespaceParams params;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index af23514713..394db19022 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -806,14 +806,18 @@ typedef struct QEMU_PACKED NvmePSD {
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
 enum NvmeIdCns {
-NVME_ID_CNS_NS= 0x00,
-NVME_ID_CNS_CTRL  = 0x01,
-NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
-NVME_ID_CNS_NS_DESCR_LIST = 0x03,
-NVME_ID_CNS_CS_NS = 0x05,
-NVME_ID_CNS_CS_CTRL   = 0x06,
-NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
-NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
+NVME_ID_CNS_NS= 0x00,
+NVME_ID_CNS_CTRL  = 0x01,
+NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
+NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+NVME_ID_CNS_CS_NS = 0x05,
+NVME_ID_CNS_CS_CTRL   = 0x06,
+NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+NVME_ID_CNS_NS_PRESENT_LIST   = 0x10,
+NVME_ID_CNS_NS_PRESENT= 0x11,
+NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
+NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
+NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index c0362426cc..e191ef9be0 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -42,6 +42,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
 id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
 
 ns->csi = NVME_CSI_NVM;
+ns->attached = true;
 
 /* no thin provisioning */
 id_ns->ncap = id_ns->nsze;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index bb82dd9975..7495cdb5ef 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1236,6 +1236,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, 
uint32_t buf_len,
 uint32_t trans_len;
 NvmeNamespace *ns;
 time_t current_ms;
+int i;
 
 if (off >= sizeof(smart)) {
 return NVME_INVALID_FIELD | NVME_DNR;
@@ -1246,10 +1247,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 if (!ns) {
 return NVME_INVALID_NSID | NVME_DNR;
 }
-nvme_set_blk_stats(ns, );
 } else {
-int i;
-
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
 if (!ns) {
@@ -1552,7 +1550,8 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req,
+ bool only_active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -1569,11 +1568,16 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_rpt_empty_id_struct(n, req);
 }
 
+if (only_active && !ns->attached) {
+return nvme_rpt_empty_i

[PATCH v9 08/12] hw/block/nvme: Support Zoned Namespace Command Set

2020-11-04 Thread Dmitry Fomichev
The emulation code has been changed to advertise NVM Command Set when
"zoned" device property is not set (default) and Zoned Namespace
Command Set otherwise.

Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
Define trace events where needed in newly introduced code.

In order to improve scalability, all open, closed and full zones
are organized in separate linked lists. Consequently, almost all
zone operations don't require scanning of the entire zone array
(which potentially can be quite large) - it is only necessary to
enumerate one or more zone lists.

Handlers for three new NVMe commands introduced in Zoned Namespace
Command Set specification are added, namely for Zone Management
Receive, Zone Management Send and Zone Append.

Device initialization code has been extended to create a proper
configuration for zoned operation using device properties.

Read/Write command handler is modified to only allow writes at the
write pointer if the namespace is zoned. For Zone Append command,
writes implicitly happen at the write pointer and the starting write
pointer value is returned as the result of the command. Write Zeroes
handler is modified to add zoned checks that are identical to those
done as a part of Write flow.

Subsequent commits in this series add ZDE support and checks for
active and open zone limits.

Signed-off-by: Niklas Cassel 
Signed-off-by: Hans Holmberg 
Signed-off-by: Ajay Joshi 
Signed-off-by: Chaitanya Kulkarni 
Signed-off-by: Matias Bjorling 
Signed-off-by: Aravind Ramesh 
Signed-off-by: Shin'ichiro Kawasaki 
Signed-off-by: Adam Manzanares 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  54 +++
 hw/block/nvme.h   |   8 +
 hw/block/nvme-ns.c| 173 
 hw/block/nvme.c   | 971 +-
 hw/block/trace-events |  18 +-
 5 files changed, 1209 insertions(+), 15 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 2d9cd29d07..d2631ff5a3 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,9 +19,20 @@
 #define NVME_NS(obj) \
 OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
 
+typedef struct NvmeZone {
+NvmeZoneDescr   d;
+uint64_tw_ptr;
+QTAILQ_ENTRY(NvmeZone) entry;
+} NvmeZone;
+
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
 QemuUUID uuid;
+
+bool zoned;
+bool cross_zone_read;
+uint64_t zone_size_bs;
+uint64_t zone_cap_bs;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -34,6 +45,18 @@ typedef struct NvmeNamespace {
 bool attached;
 uint8_t  csi;
 
+NvmeIdNsZoned   *id_ns_zoned;
+NvmeZone*zone_array;
+QTAILQ_HEAD(, NvmeZone) exp_open_zones;
+QTAILQ_HEAD(, NvmeZone) imp_open_zones;
+QTAILQ_HEAD(, NvmeZone) closed_zones;
+QTAILQ_HEAD(, NvmeZone) full_zones;
+uint32_tnum_zones;
+uint64_tzone_size;
+uint64_tzone_capacity;
+uint64_tzone_array_size;
+uint32_tzone_size_log2;
+
 NvmeNamespaceParams params;
 } NvmeNamespace;
 
@@ -71,8 +94,39 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t 
lba)
 
 typedef struct NvmeCtrl NvmeCtrl;
 
+static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
+{
+return zone->d.zs >> 4;
+}
+
+static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState 
state)
+{
+zone->d.zs = state << 4;
+}
+
+static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
+{
+return zone->d.zslba + ns->zone_size;
+}
+
+static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
+{
+return zone->d.zslba + zone->d.zcap;
+}
+
+static inline bool nvme_wp_is_valid(NvmeZone *zone)
+{
+uint8_t st = nvme_get_zone_state(zone);
+
+return st != NVME_ZONE_STATE_FULL &&
+   st != NVME_ZONE_STATE_READ_ONLY &&
+   st != NVME_ZONE_STATE_OFFLINE;
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_flush(NvmeNamespace *ns);
+void nvme_ns_shutdown(NvmeNamespace *ns);
+void nvme_ns_cleanup(NvmeNamespace *ns);
 
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a..4cb0615128 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -6,6 +6,9 @@
 
 #define NVME_MAX_NAMESPACES 256
 
+#define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
+#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+
 typedef struct NvmeParams {
 char *serial;
 uint32_t num_queues; /* deprecated since 5.1 */
@@ -16,6 +19,7 @@ typedef struct NvmeParams {
 uint32_t aer_max_queued;
 uint8_t  mdts;
 bool use_intel_id;
+uint32_t zasl_bs;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -28,6 +32,8 @@ typedef struct NvmeRequest {
 struct NvmeNamespace*ns;
 BlockAIOCB 

[PATCH v9 10/12] hw/block/nvme: Support Zone Descriptor Extensions

2020-11-04 Thread Dmitry Fomichev
Zone Descriptor Extension is a label that can be assigned to a zone.
It can be set to an Empty zone and it stays assigned until the zone
is reset.

This commit adds a new optional module property,
"zoned.descr_ext_size". Its value must be a multiple of 64 bytes.
If this value is non-zero, it becomes possible to assign extensions
of that size to any Empty zones. The default value for this property
is 0, therefore setting extensions is disabled by default.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  8 +++
 hw/block/nvme-ns.c| 25 ++--
 hw/block/nvme.c   | 54 +--
 hw/block/trace-events |  2 ++
 4 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 421bab0a57..50a6a0e1ac 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -35,6 +35,7 @@ typedef struct NvmeNamespaceParams {
 uint64_t zone_cap_bs;
 uint32_t max_active_zones;
 uint32_t max_open_zones;
+uint32_t zd_extension_size;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -58,6 +59,7 @@ typedef struct NvmeNamespace {
 uint64_tzone_capacity;
 uint64_tzone_array_size;
 uint32_tzone_size_log2;
+uint8_t *zd_extensions;
 int32_t nr_open_zones;
 int32_t nr_active_zones;
 
@@ -127,6 +129,12 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
+ uint32_t zone_idx)
+{
+return >zd_extensions[zone_idx * ns->params.zd_extension_size];
+}
+
 static inline void nvme_aor_inc_open(NvmeNamespace *ns)
 {
 assert(ns->nr_open_zones >= 0);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 2e45838c15..85dc73cf06 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -133,6 +133,18 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 return -1;
 }
 
+if (ns->params.zd_extension_size) {
+if (ns->params.zd_extension_size & 0x3f) {
+error_setg(errp,
+"zone descriptor extension size must be a multiple of 64B");
+return -1;
+}
+if ((ns->params.zd_extension_size >> 6) > 0xff) {
+error_setg(errp, "zone descriptor extension size is too large");
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -144,6 +156,10 @@ static void nvme_init_zone_state(NvmeNamespace *ns)
 int i;
 
 ns->zone_array = g_malloc0(ns->zone_array_size);
+if (ns->params.zd_extension_size) {
+ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
+  ns->num_zones);
+}
 
 QTAILQ_INIT(>exp_open_zones);
 QTAILQ_INIT(>imp_open_zones);
@@ -186,7 +202,8 @@ static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index,
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
 id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
-id_ns_z->lbafe[lba_index].zdes = 0;
+id_ns_z->lbafe[lba_index].zdes =
+ns->params.zd_extension_size >> 6; /* Units of 64B */
 
 ns->csi = NVME_CSI_ZONED;
 ns->id_ns.nsze = cpu_to_le64(ns->zone_size * ns->num_zones);
@@ -204,7 +221,8 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 
 zone->w_ptr = zone->d.wp;
 state = nvme_get_zone_state(zone);
-if (zone->d.wp != zone->d.zslba) {
+if (zone->d.wp != zone->d.zslba ||
+(zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
 if (state != NVME_ZONE_STATE_CLOSED) {
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
@@ -301,6 +319,7 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
 if (ns->params.zoned) {
 g_free(ns->id_ns_zoned);
 g_free(ns->zone_array);
+g_free(ns->zd_extensions);
 }
 }
 
@@ -332,6 +351,8 @@ static Property nvme_ns_props[] = {
params.max_active_zones, 0),
 DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
params.max_open_zones, 0),
+DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
+   params.zd_extension_size, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index cbfd58b7c1..0db51995cc 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1703,6 +1703,26 @@ static uint16_t nvme_offline_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 return NVME_ZONE_INVAL_TRANSITION;
 }
 
+static uint16_t nvme_set_zd_ext(NvmeNamespace *n

[PATCH v9 09/12] hw/block/nvme: Introduce max active and open zone limits

2020-11-04 Thread Dmitry Fomichev
Add two module properties, "zoned.max_active" and "zoned.max_open"
to control the maximum number of zones that can be active or open.
Once these variables are set to non-default values, these limits are
checked during I/O and Too Many Active or Too Many Open command status
is returned if they are exceeded.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h| 41 +++
 hw/block/nvme-ns.c| 30 +-
 hw/block/nvme.c   | 94 +++
 hw/block/trace-events |  2 +
 4 files changed, 165 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index d2631ff5a3..421bab0a57 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -33,6 +33,8 @@ typedef struct NvmeNamespaceParams {
 bool cross_zone_read;
 uint64_t zone_size_bs;
 uint64_t zone_cap_bs;
+uint32_t max_active_zones;
+uint32_t max_open_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -56,6 +58,8 @@ typedef struct NvmeNamespace {
 uint64_tzone_capacity;
 uint64_tzone_array_size;
 uint32_tzone_size_log2;
+int32_t nr_open_zones;
+int32_t nr_active_zones;
 
 NvmeNamespaceParams params;
 } NvmeNamespace;
@@ -123,6 +127,43 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline void nvme_aor_inc_open(NvmeNamespace *ns)
+{
+assert(ns->nr_open_zones >= 0);
+if (ns->params.max_open_zones) {
+ns->nr_open_zones++;
+assert(ns->nr_open_zones <= ns->params.max_open_zones);
+}
+}
+
+static inline void nvme_aor_dec_open(NvmeNamespace *ns)
+{
+if (ns->params.max_open_zones) {
+assert(ns->nr_open_zones > 0);
+ns->nr_open_zones--;
+}
+assert(ns->nr_open_zones >= 0);
+}
+
+static inline void nvme_aor_inc_active(NvmeNamespace *ns)
+{
+assert(ns->nr_active_zones >= 0);
+if (ns->params.max_active_zones) {
+ns->nr_active_zones++;
+assert(ns->nr_active_zones <= ns->params.max_active_zones);
+}
+}
+
+static inline void nvme_aor_dec_active(NvmeNamespace *ns)
+{
+if (ns->params.max_active_zones) {
+assert(ns->nr_active_zones > 0);
+ns->nr_active_zones--;
+assert(ns->nr_active_zones >= ns->nr_open_zones);
+}
+assert(ns->nr_active_zones >= 0);
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_flush(NvmeNamespace *ns);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index e6db7f7d3b..2e45838c15 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -119,6 +119,20 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 ns->zone_size_log2 = 63 - clz64(ns->zone_size);
 }
 
+/* Make sure that the values of all ZNS properties are sane */
+if (ns->params.max_open_zones > nz) {
+error_setg(errp,
+   "max_open_zones value %u exceeds the number of zones %u",
+   ns->params.max_open_zones, nz);
+return -1;
+}
+if (ns->params.max_active_zones > nz) {
+error_setg(errp,
+   "max_active_zones value %u exceeds the number of zones %u",
+   ns->params.max_active_zones, nz);
+return -1;
+}
+
 return 0;
 }
 
@@ -166,8 +180,8 @@ static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index,
 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
 
 /* MAR/MOR are zeroes-based, 0x means no limit */
-id_ns_z->mar = 0x;
-id_ns_z->mor = 0x;
+id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
+id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
 id_ns_z->zoc = 0;
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
@@ -195,6 +209,7 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
 }
+nvme_aor_inc_active(ns);
 QTAILQ_INSERT_HEAD(>closed_zones, zone, entry);
 } else {
 trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
@@ -211,16 +226,23 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
 
 QTAILQ_FOREACH_SAFE(zone, >closed_zones, entry, next) {
 QTAILQ_REMOVE(>closed_zones, zone, entry);
+nvme_aor_dec_active(ns);
 nvme_clear_zone(ns, zone);
 }
 QTAILQ_FOREACH_SAFE(zone, >imp_open_zones, entry, next) {
 QTAILQ_REMOVE(>imp_open_zones, zone, entry);
+nvme_aor_dec_open(ns);
+nvme_aor_dec_ac

[PATCH v9 07/12] block/nvme: Make ZNS-related definitions

2020-11-04 Thread Dmitry Fomichev
Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053).

Signed-off-by: Dmitry Fomichev 
---
 include/block/nvme.h | 114 ++-
 1 file changed, 113 insertions(+), 1 deletion(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 394db19022..752623b4f9 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -489,6 +489,9 @@ enum NvmeIoCommands {
 NVME_CMD_COMPARE= 0x05,
 NVME_CMD_WRITE_ZEROES   = 0x08,
 NVME_CMD_DSM= 0x09,
+NVME_CMD_ZONE_MGMT_SEND = 0x79,
+NVME_CMD_ZONE_MGMT_RECV = 0x7a,
+NVME_CMD_ZONE_APPEND= 0x7d,
 };
 
 typedef struct QEMU_PACKED NvmeDeleteQ {
@@ -648,9 +651,13 @@ typedef struct QEMU_PACKED NvmeAerResult {
 uint8_t resv;
 } NvmeAerResult;
 
+typedef struct QEMU_PACKED NvmeZonedResult {
+uint64_t slba;
+} NvmeZonedResult;
+
 typedef struct QEMU_PACKED NvmeCqe {
 uint32_tresult;
-uint32_trsvd;
+uint32_tdw1;
 uint16_tsq_head;
 uint16_tsq_id;
 uint16_tcid;
@@ -679,6 +686,7 @@ enum NvmeStatusCodes {
 NVME_INVALID_USE_OF_CMB = 0x0012,
 NVME_INVALID_PRP_OFFSET = 0x0013,
 NVME_CMD_SET_CMB_REJECTED   = 0x002b,
+NVME_INVALID_CMD_SET= 0x002c,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -703,6 +711,14 @@ enum NvmeStatusCodes {
 NVME_CONFLICTING_ATTRS  = 0x0180,
 NVME_INVALID_PROT_INFO  = 0x0181,
 NVME_WRITE_TO_RO= 0x0182,
+NVME_ZONE_BOUNDARY_ERROR= 0x01b8,
+NVME_ZONE_FULL  = 0x01b9,
+NVME_ZONE_READ_ONLY = 0x01ba,
+NVME_ZONE_OFFLINE   = 0x01bb,
+NVME_ZONE_INVALID_WRITE = 0x01bc,
+NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
+NVME_ZONE_TOO_MANY_OPEN = 0x01be,
+NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
 NVME_WRITE_FAULT= 0x0280,
 NVME_UNRECOVERED_READ   = 0x0281,
 NVME_E2E_GUARD_ERROR= 0x0282,
@@ -887,6 +903,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
 uint8_t vs[1024];
 } NvmeIdCtrl;
 
+typedef struct NvmeIdCtrlZoned {
+uint8_t zasl;
+uint8_t rsvd1[4095];
+} NvmeIdCtrlZoned;
+
 enum NvmeIdCtrlOacs {
 NVME_OACS_SECURITY  = 1 << 0,
 NVME_OACS_FORMAT= 1 << 1,
@@ -1012,6 +1033,12 @@ typedef struct QEMU_PACKED NvmeLBAF {
 uint8_t rp;
 } NvmeLBAF;
 
+typedef struct QEMU_PACKED NvmeLBAFE {
+uint64_tzsze;
+uint8_t zdes;
+uint8_t rsvd9[7];
+} NvmeLBAFE;
+
 #define NVME_NSID_BROADCAST 0xffffffff
 
 typedef struct QEMU_PACKED NvmeIdNs {
@@ -1066,10 +1093,24 @@ enum NvmeNsIdentifierType {
 
 enum NvmeCsi {
 NVME_CSI_NVM= 0x00,
+NVME_CSI_ZONED  = 0x02,
 };
 
 #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
 
+typedef struct QEMU_PACKED NvmeIdNsZoned {
+uint16_tzoc;
+uint16_tozcs;
+uint32_tmar;
+uint32_tmor;
+uint32_trrl;
+uint32_tfrl;
+uint8_t rsvd20[2796];
+NvmeLBAFE   lbafe[16];
+uint8_t rsvd3072[768];
+uint8_t vs[256];
+} NvmeIdNsZoned;
+
 /*Deallocate Logical Block Features*/
 #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)   ((dlfeat) & 0x10)
 #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)((dlfeat) & 0x08)
@@ -1101,10 +1142,76 @@ enum NvmeIdNsDps {
 DPS_FIRST_EIGHT = 8,
 };
 
+enum NvmeZoneAttr {
+NVME_ZA_FINISHED_BY_CTLR = 1 << 0,
+NVME_ZA_FINISH_RECOMMENDED   = 1 << 1,
+NVME_ZA_RESET_RECOMMENDED= 1 << 2,
+NVME_ZA_ZD_EXT_VALID = 1 << 7,
+};
+
+typedef struct QEMU_PACKED NvmeZoneReportHeader {
+uint64_tnr_zones;
+uint8_t rsvd[56];
+} NvmeZoneReportHeader;
+
+enum NvmeZoneReceiveAction {
+NVME_ZONE_REPORT = 0,
+NVME_ZONE_REPORT_EXTENDED= 1,
+};
+
+enum NvmeZoneReportType {
+NVME_ZONE_REPORT_ALL = 0,
+NVME_ZONE_REPORT_EMPTY   = 1,
+NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
+NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
+NVME_ZONE_REPORT_CLOSED  = 4,
+NVME_ZONE_REPORT_FULL= 5,
+NVME_ZONE_REPORT_READ_ONLY   = 6,
+NVME_ZONE_REPORT_OFFLINE = 7,
+};
+
+enum NvmeZoneType {
+NVME_ZONE_TYPE_RESERVED  = 0x00,
+NVME_ZONE_TYPE_SEQ_WRITE = 0x02,
+};
+
+enum NvmeZoneSendAction {
+NVME_ZONE_ACTION_RSD = 0x00,
+NVME_ZONE_ACTION_CLOSE   = 0x01,
+NVME_ZONE_ACTION_FINISH  = 0x02,
+NVME_ZONE_ACTION_OPEN= 0x03,
+NVME_ZONE_ACTION_RESET   = 0x04,
+NVME_ZONE_ACTION_OFFLINE = 0x05,
+NVME_ZONE_ACTION_SET_ZD_EXT  = 0x10,
+};
+
+typedef struct QEMU_PACKED NvmeZoneDescr {
+uint8_t zt;
+uint8_t zs;
+uint8_t za

[PATCH v9 05/12] hw/block/nvme: Add support for Namespace Types

2020-11-04 Thread Dmitry Fomichev
From: Niklas Cassel 

Define the structures and constants required to implement
Namespace Types support.

Namespace Types introduce a new command set, "I/O Command Sets",
that allows the host to retrieve the command sets associated with
a namespace. Introduce support for the command set and enable
detection for the NVM Command Set.

The new workflows for identify commands rely heavily on zero-filled
identify structs. E.g., certain CNS commands are defined to return
a zero-filled identify struct when an inactive namespace NSID
is supplied.

Add a helper function in order to avoid code duplication when
reporting zero-filled identify structures.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
---
 hw/block/nvme-ns.h|   1 +
 include/block/nvme.h  |  66 +++
 hw/block/nvme-ns.c|   2 +
 hw/block/nvme.c   | 188 +++---
 hw/block/trace-events |   7 ++
 5 files changed, 219 insertions(+), 45 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index a38071884a..d795e44bab 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
 int64_t  size;
 NvmeIdNs id_ns;
 const uint32_t *iocs;
+uint8_t  csi;
 
 NvmeNamespaceParams params;
 } NvmeNamespace;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index f62cc90d49..af23514713 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -84,6 +84,7 @@ enum NvmeCapMask {
 
 enum NvmeCapCss {
 NVME_CAP_CSS_NVM= 1 << 0,
+NVME_CAP_CSS_CSI_SUPP   = 1 << 6,
 NVME_CAP_CSS_ADMIN_ONLY = 1 << 7,
 };
 
@@ -117,9 +118,25 @@ enum NvmeCcMask {
 
 enum NvmeCcCss {
 NVME_CC_CSS_NVM= 0x0,
+NVME_CC_CSS_CSI= 0x6,
 NVME_CC_CSS_ADMIN_ONLY = 0x7,
 };
 
+#define NVME_SET_CC_EN(cc, val) \
+(cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT)
+#define NVME_SET_CC_CSS(cc, val)\
+(cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT)
+#define NVME_SET_CC_MPS(cc, val)\
+(cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT)
+#define NVME_SET_CC_AMS(cc, val)\
+(cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT)
+#define NVME_SET_CC_SHN(cc, val)\
+(cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT)
+#define NVME_SET_CC_IOSQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
 enum NvmeCstsShift {
 CSTS_RDY_SHIFT  = 0,
 CSTS_CFS_SHIFT  = 1,
@@ -534,8 +551,13 @@ typedef struct QEMU_PACKED NvmeIdentify {
 uint64_trsvd2[2];
 uint64_tprp1;
 uint64_tprp2;
-uint32_tcns;
-uint32_trsvd11[5];
+uint8_t cns;
+uint8_t rsvd10;
+uint16_tctrlid;
+uint16_tnvmsetid;
+uint8_t rsvd11;
+uint8_t csi;
+uint32_trsvd12[4];
 } NvmeIdentify;
 
 typedef struct QEMU_PACKED NvmeRwCmd {
@@ -656,6 +678,7 @@ enum NvmeStatusCodes {
 NVME_SGL_DESCR_TYPE_INVALID = 0x0011,
 NVME_INVALID_USE_OF_CMB = 0x0012,
 NVME_INVALID_PRP_OFFSET = 0x0013,
+NVME_CMD_SET_CMB_REJECTED   = 0x002b,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -782,11 +805,15 @@ typedef struct QEMU_PACKED NvmePSD {
 
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
-enum {
-NVME_ID_CNS_NS = 0x0,
-NVME_ID_CNS_CTRL   = 0x1,
-NVME_ID_CNS_NS_ACTIVE_LIST = 0x2,
-NVME_ID_CNS_NS_DESCR_LIST  = 0x3,
+enum NvmeIdCns {
+NVME_ID_CNS_NS= 0x00,
+NVME_ID_CNS_CTRL  = 0x01,
+NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
+NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+NVME_ID_CNS_CS_NS = 0x05,
+NVME_ID_CNS_CS_CTRL   = 0x06,
+NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
@@ -934,6 +961,7 @@ enum NvmeFeatureIds {
 NVME_WRITE_ATOMICITY= 0xa,
 NVME_ASYNCHRONOUS_EVENT_CONF= 0xb,
 NVME_TIMESTAMP  = 0xe,
+NVME_COMMAND_SET_PROFILE= 0x19,
 NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
 NVME_FID_MAX= 0x100,
 };
@@ -1018,18 +1046,26 @@ typedef struct QEMU_PACKED NvmeIdNsDescr {
 uint8_t rsvd2[2];
 } NvmeIdNsDescr;
 
-enum {
-NVME_NIDT_EUI64_LEN =  8,
-NVME_NIDT_NGUID_LEN = 16,
-NVME_NIDT_UUID_LEN  = 16,
+enum NvmeNsIdentifierLength {
+NVME_NIDL_EUI64 = 8,
+NVME_NIDL_NGUID = 16,
+NVME_NIDL_UUID  = 16,
+NVME_NIDL_CSI   = 1,
 };
 
 enum NvmeNsIdentifierType {
-NVME_NIDT_EUI64 = 0x1,
-NVME_NIDT_NGUID = 0x2,
-NVME_

[PATCH v9 02/12] hw/block/nvme: Generate namespace UUIDs

2020-11-04 Thread Dmitry Fomichev
In NVMe 1.4, a namespace must report an ID descriptor of UUID type
if it doesn't support EUI64 or NGUID. Add a new namespace property,
"uuid", that provides the user the option to either specify the UUID
explicitly or have a UUID generated automatically every time a
namespace is initialized.

Suggested-by: Klaus Jensen 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h | 1 +
 hw/block/nvme-ns.c | 1 +
 hw/block/nvme.c| 9 +
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index ea8c2f785d..a38071884a 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -21,6 +21,7 @@
 
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
+QemuUUID uuid;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index b69cdaf27e..de735eb9f3 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -129,6 +129,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 static Property nvme_ns_props[] = {
 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
+DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 702f7cc2e3..ed3f38f01d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1564,6 +1564,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
+NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
 uint32_t nsid = le32_to_cpu(c->nsid);
 uint8_t list[NVME_IDENTIFY_DATA_SIZE];
@@ -1583,7 +1584,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_NSID | NVME_DNR;
 }
 
-if (unlikely(!nvme_ns(n, nsid))) {
+ns = nvme_ns(n, nsid);
+if (unlikely(!ns)) {
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
@@ -1592,12 +1594,11 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl 
*n, NvmeRequest *req)
 /*
  * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
  * structure, a Namespace UUID (nidt = 0x3) must be reported in the
- * Namespace Identification Descriptor. Add a very basic Namespace UUID
- * here.
+ * Namespace Identification Descriptor. Add the namespace UUID here.
  */
 ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
 ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
-stl_be_p(_descrs->uuid.v, nsid);
+memcpy(_descrs->uuid.v, ns->params.uuid.data, NVME_NIDT_UUID_LEN);
 
 return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
 DMA_DIRECTION_FROM_DEVICE, req);
-- 
2.21.0




[PATCH v9 03/12] hw/block/nvme: Separate read and write handlers

2020-11-04 Thread Dmitry Fomichev
With ZNS support in place, the majority of code in nvme_rw() has
become read- or write-specific. Move these parts to two separate
handlers, nvme_read() and nvme_write() to make the code more
readable and to remove multiple is_write checks that so far existed
in the i/o path.

This is a refactoring patch, no change in functionality.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Acked-by: Klaus Jensen 
---
 hw/block/nvme.c   | 91 ++-
 hw/block/trace-events |  3 +-
 2 files changed, 67 insertions(+), 27 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index ed3f38f01d..770e42a066 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -953,6 +953,54 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
+static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
+NvmeNamespace *ns = req->ns;
+uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+uint64_t data_size = nvme_l2b(ns, nlb);
+uint64_t data_offset;
+BlockBackend *blk = ns->blkconf.blk;
+uint16_t status;
+
+trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba);
+
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
+
+status = nvme_check_bounds(n, ns, slba, nlb);
+if (status) {
+trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+goto invalid;
+}
+
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_READ);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_read(blk, >qsg, data_offset,
+  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_preadv(blk, data_offset, >iov, 0,
+nvme_rw_cb, req);
+}
+return NVME_NO_COMPLETE;
+
+invalid:
+block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
+return status | NVME_DNR;
+}
+
 static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
@@ -978,22 +1026,19 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
 NvmeNamespace *ns = req->ns;
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
-
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t data_size = nvme_l2b(ns, nlb);
-uint64_t data_offset = nvme_l2b(ns, slba);
-enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ?
-BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+uint64_t data_offset;
 BlockBackend *blk = ns->blkconf.blk;
 uint16_t status;
 
-trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode),
-  nvme_nsid(ns), nlb, data_size, slba);
+trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
+ nvme_nsid(ns), nlb, data_size, slba);
 
 status = nvme_check_mdts(n, data_size);
 if (status) {
@@ -1012,29 +1057,22 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 goto invalid;
 }
 
-block_acct_start(blk_get_stats(blk), >acct, data_size, acct);
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_WRITE);
 if (req->qsg.sg) {
-if (acct == BLOCK_ACCT_WRITE) {
-req->aiocb = dma_blk_write(blk, >qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-} else {
-req->aiocb = dma_blk_read(blk, >qsg, data_offset,
-  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-}
+req->aiocb = dma_blk_write(blk, >qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
 } else {
-if (acct == BLOCK_ACCT_WRITE) {
-req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
- nvme_rw_cb, req);
-} else {
-req->aiocb = blk_aio_preadv(blk, data_offset, >iov, 0,
-nvme_rw_cb, req);
-}
+req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
+ nvme_rw_cb, req);
 }
 return NVME_NO_COMPLETE;
 
 invalid:
-block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct)

[PATCH v9 00/12] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set

2020-11-04 Thread Dmitry Fomichev
d there to output empty namespace identify structs.
 - Fixed typo in Maxim's email address.

v1 -> v2:

 - Rebased on top of qemu-nvme/next branch.
 - Incorporated feedback from Klaus and Alistair.
* Allow a subset of CSE log to be read, not the entire log
* Assign admin command entries in CSE log to ACS fields
* Set LPA bit 1 to indicate support of CSE log page
* Rename CC.CSS value CSS_ALL_NSTYPES (110b) to CSS_CSI
* Move the code to assign lbaf.ds to a separate patch
* Remove the change in firmware revision
* Change "driver" to "device" in comments and annotations
* Rename ZAMDS to ZASL
* Correct a few format expressions and some wording in
  trace event definitions
* Remove validation code to return NVME_CAP_EXCEEDED error
* Make ZASL to be equal to MDTS if "zone_append_size_limit"
  module parameter is not set
* Clean up nvme_zoned_init_ctrl() to make size calculations
  less confusing
* Avoid changing module parameters, use separate n/s variables
  if additional calculations are necessary to convert parameters
  to running values
* Use NVME_DEFAULT_ZONE_SIZE to assign the default zone size value
* Use default 0 for zone capacity meaning that zone capacity will
  be equal to zone size by default
* Issue warnings if user MAR/MOR values are too large and have
  to be adjusted
* Use unsigned values for MAR/MOR
 - Dropped "Simulate Zone Active excursions" patch.
   Excursion behavior may depend on the internal controller
   architecture and therefore be vendor-specific.
 - Dropped support for Zone Attributes and zoned AENs for now.
   These features can be added in a future series.
 - NS Types support is extended to handle active/inactive namespaces.
 - Update the write pointer after backing storage I/O completion, not
   before. This makes the emulation to run correctly in case of
   backing device failures.
 - Avoid division in the I/O path if the device zone size is
   a power of two (the most common case). Zone index then can be
   calculated by using bit shift.
 - A few reported bugs have been fixed.
 - Indentation in function definitions has been changed to make it
   the same as the rest of the code.


Zoned Namespace (ZNS) Command Set is a newly introduced command set
published by the NVM Express, Inc. organization as TP 4053. The main
design goals of ZNS are to provide hardware designers the means to
reduce NVMe controller complexity and to allow achieving a better I/O
latency and throughput. SSDs that implement this interface are
commonly known as ZNS SSDs.

This command set is implementing a zoned storage model, similarly to
ZAC/ZBC. As such, there is already support in Linux, allowing one to
perform the majority of tasks needed for managing ZNS SSDs.

The Zoned Namespace Command Set relies on another TP, known as
Namespace Types (NVMe TP 4056), which introduces support for having
multiple command sets per namespace.

Both ZNS and Namespace Types specifications can be downloaded by
visiting the following link -

https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs.zip

This patch series adds Namespace Types support and zoned namespace
emulation capability to the existing NVMe PCI device.

Based-on: <20201104102248.32168-1-...@irrelevant.dk>

Dmitry Fomichev (10):
  hw/block/nvme: Add Commands Supported and Effects log
  hw/block/nvme: Generate namespace UUIDs
  hw/block/nvme: Separate read and write handlers
  hw/block/nvme: Merge nvme_write_zeroes() with nvme_write()
  block/nvme: Make ZNS-related definitions
  hw/block/nvme: Support Zoned Namespace Command Set
  hw/block/nvme: Introduce max active and open zone limits
  hw/block/nvme: Support Zone Descriptor Extensions
  hw/block/nvme: Add injection of Offline/Read-Only zones
  hw/block/nvme: Document zoned parameters in usage text

Niklas Cassel (2):
  hw/block/nvme: Add support for Namespace Types
  hw/block/nvme: Support allocated CNS command variants

 hw/block/nvme-ns.h|  109 +++
 hw/block/nvme.h   |8 +
 include/block/nvme.h  |  205 +-
 hw/block/nvme-ns.c|  276 +++
 hw/block/nvme.c   | 1590 ++---
 hw/block/trace-events |   32 +-
 6 files changed, 2095 insertions(+), 125 deletions(-)

-- 
2.21.0




[PATCH v9 01/12] hw/block/nvme: Add Commands Supported and Effects log

2020-11-04 Thread Dmitry Fomichev
This log page becomes necessary to implement to allow checking for
Zone Append command support in Zoned Namespace Command Set.

This commit adds the code to report this log page for NVM Command
Set only. The parts that are specific to zoned operation will be
added later in the series.

All incoming admin and i/o commands are now only processed if their
corresponding support bits are set in this log. This provides an
easy way to control what commands to support and what not to
depending on set CC.CSS.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  1 +
 include/block/nvme.h  | 19 +
 hw/block/nvme.c   | 96 +++
 hw/block/trace-events |  1 +
 4 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 83734f4606..ea8c2f785d 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -29,6 +29,7 @@ typedef struct NvmeNamespace {
 int32_t  bootindex;
 int64_t  size;
 NvmeIdNs id_ns;
+const uint32_t *iocs;
 
 NvmeNamespaceParams params;
 } NvmeNamespace;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 8a46d9cf01..f62cc90d49 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -745,10 +745,27 @@ enum NvmeSmartWarn {
 NVME_SMART_FAILED_VOLATILE_MEDIA  = 1 << 4,
 };
 
+typedef struct NvmeEffectsLog {
+uint32_tacs[256];
+uint32_tiocs[256];
+uint8_t resv[2048];
+} NvmeEffectsLog;
+
+enum {
+NVME_CMD_EFF_CSUPP  = 1 << 0,
+NVME_CMD_EFF_LBCC   = 1 << 1,
+NVME_CMD_EFF_NCC= 1 << 2,
+NVME_CMD_EFF_NIC= 1 << 3,
+NVME_CMD_EFF_CCC= 1 << 4,
+NVME_CMD_EFF_CSE_MASK   = 3 << 16,
+NVME_CMD_EFF_UUID_SEL   = 1 << 19,
+};
+
 enum NvmeLogIdentifier {
 NVME_LOG_ERROR_INFO = 0x01,
 NVME_LOG_SMART_INFO = 0x02,
 NVME_LOG_FW_SLOT_INFO   = 0x03,
+NVME_LOG_CMD_EFFECTS= 0x05,
 };
 
 typedef struct QEMU_PACKED NvmePSD {
@@ -861,6 +878,7 @@ enum NvmeIdCtrlFrmw {
 
 enum NvmeIdCtrlLpa {
 NVME_LPA_NS_SMART = 1 << 0,
+NVME_LPA_CSE  = 1 << 1,
 NVME_LPA_EXTENDED = 1 << 2,
 };
 
@@ -1060,6 +1078,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
+QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 065e763e4f..702f7cc2e3 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -111,6 +111,28 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
 };
 
+static const uint32_t nvme_cse_acs[256] = {
+[NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_DELETE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ABORT]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
+};
+
+static const uint32_t nvme_cse_iocs_none[256];
+
+static const uint32_t nvme_cse_iocs_nvm[256] = {
+[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
+};
+
 static void nvme_process_sq(void *opaque);
 
 static uint16_t nvme_cid(NvmeRequest *req)
@@ -1022,10 +1044,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
   req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
 
-if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) {
-return NVME_INVALID_OPCODE | NVME_DNR;
-}
-
 if (!nvme_nsid_valid(n, nsid)) {
 return NVME_INVALID_NSID | NVME_DNR;
 }
@@ -1035,6 +1053,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
+if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
+trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
+return NVME_INVALID_OPCODE | NVME_DNR;
+}
+
 switch (req->cmd.opcode) {
 case NVME_CMD_FLUSH:
 return nvme_flush(n, re

[PATCH v9 04/12] hw/block/nvme: Merge nvme_write_zeroes() with nvme_write()

2020-11-04 Thread Dmitry Fomichev
nvme_write() now handles WRITE, WRITE ZEROES and ZONE_APPEND.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Acked-by: Klaus Jensen 
---
 hw/block/nvme.c   | 72 +--
 hw/block/trace-events |  1 -
 2 files changed, 28 insertions(+), 45 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 770e42a066..48adbe84f5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1001,32 +1001,7 @@ invalid:
 return status | NVME_DNR;
 }
 
-static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
-{
-NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
-NvmeNamespace *ns = req->ns;
-uint64_t slba = le64_to_cpu(rw->slba);
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
-uint64_t offset = nvme_l2b(ns, slba);
-uint32_t count = nvme_l2b(ns, nlb);
-uint16_t status;
-
-trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
-
-status = nvme_check_bounds(n, ns, slba, nlb);
-if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
-return status;
-}
-
-block_acct_start(blk_get_stats(req->ns->blkconf.blk), >acct, 0,
- BLOCK_ACCT_WRITE);
-req->aiocb = blk_aio_pwrite_zeroes(req->ns->blkconf.blk, offset, count,
-   BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
-return NVME_NO_COMPLETE;
-}
-
-static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req, bool wrz)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
 NvmeNamespace *ns = req->ns;
@@ -1040,10 +1015,12 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
  nvme_nsid(ns), nlb, data_size, slba);
 
-status = nvme_check_mdts(n, data_size);
-if (status) {
-trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
-goto invalid;
+if (!wrz) {
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
 }
 
 status = nvme_check_bounds(n, ns, slba, nlb);
@@ -1052,21 +1029,28 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req)
 goto invalid;
 }
 
-status = nvme_map_dptr(n, data_size, req);
-if (status) {
-goto invalid;
-}
-
 data_offset = nvme_l2b(ns, slba);
 
-block_acct_start(blk_get_stats(blk), >acct, data_size,
- BLOCK_ACCT_WRITE);
-if (req->qsg.sg) {
-req->aiocb = dma_blk_write(blk, >qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+if (!wrz) {
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_WRITE);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_write(blk, >qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
+ nvme_rw_cb, req);
+}
 } else {
-req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
- nvme_rw_cb, req);
+block_acct_start(blk_get_stats(blk), >acct, 0, BLOCK_ACCT_WRITE);
+req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
+   BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
+   req);
 }
 return NVME_NO_COMPLETE;
 
@@ -1100,9 +1084,9 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 case NVME_CMD_FLUSH:
 return nvme_flush(n, req);
 case NVME_CMD_WRITE_ZEROES:
-return nvme_write_zeroes(n, req);
+return nvme_write(n, req, true);
 case NVME_CMD_WRITE:
-return nvme_write(n, req);
+return nvme_write(n, req, false);
 case NVME_CMD_READ:
 return nvme_read(n, req);
 default:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 540c600931..e67e96c2b5 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -43,7 +43,6 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t 
opcode, const char *opna
 pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, 
uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 
0x%"PRIx64""
 pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, 
uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb 
%"PRIu32" count %"PRIu64" lba 0x%"P

RE: [PATCH v8 07/11] hw/block/nvme: Support Zoned Namespace Command Set

2020-11-03 Thread Dmitry Fomichev
> -Original Message-
> From: Philippe Mathieu-Daudé 
> Sent: Friday, October 30, 2020 3:16 AM
> To: Dmitry Fomichev ; Keith Busch
> ; Klaus Jensen ; Kevin Wolf
> ; Maxim Levitsky ; Fam Zheng
> 
> Cc: Alistair Francis ; Matias Bjorling
> ; Niklas Cassel ;
> Damien Le Moal ; qemu-bl...@nongnu.org;
> qemu-devel@nongnu.org
> Subject: Re: [PATCH v8 07/11] hw/block/nvme: Support Zoned Namespace
> Command Set
> 
> Hi Dmitry,
> 
> On 10/30/20 3:32 AM, Dmitry Fomichev wrote:
> > The emulation code has been changed to advertise NVM Command Set
> when
> > "zoned" device property is not set (default) and Zoned Namespace
> > Command Set otherwise.
> >
> > Define values and structures that are needed to support Zoned
> > Namespace Command Set (NVMe TP 4053) in PCI NVMe controller
> emulator.
> > Define trace events where needed in newly introduced code.
> >
> > In order to improve scalability, all open, closed and full zones
> > are organized in separate linked lists. Consequently, almost all
> > zone operations don't require scanning of the entire zone array
> > (which potentially can be quite large) - it is only necessary to
> > enumerate one or more zone lists.
> >
> > Handlers for three new NVMe commands introduced in Zoned Namespace
> > Command Set specification are added, namely for Zone Management
> > Receive, Zone Management Send and Zone Append.
> >
> > Device initialization code has been extended to create a proper
> > configuration for zoned operation using device properties.
> >
> > Read/Write command handler is modified to only allow writes at the
> > write pointer if the namespace is zoned. For Zone Append command,
> > writes implicitly happen at the write pointer and the starting write
> > pointer value is returned as the result of the command. Write Zeroes
> > handler is modified to add zoned checks that are identical to those
> > done as a part of Write flow.
> >
> > Subsequent commits in this series add ZDE support and checks for
> > active and open zone limits.
> >
> > Signed-off-by: Niklas Cassel 
> > Signed-off-by: Hans Holmberg 
> > Signed-off-by: Ajay Joshi 
> > Signed-off-by: Chaitanya Kulkarni 
> > Signed-off-by: Matias Bjorling 
> > Signed-off-by: Aravind Ramesh 
> > Signed-off-by: Shin'ichiro Kawasaki 
> > Signed-off-by: Adam Manzanares 
> > Signed-off-by: Dmitry Fomichev 
> > Reviewed-by: Niklas Cassel 
> > ---
> >  block/nvme.c  |   2 +-
> >  hw/block/nvme-ns.c| 173 
> >  hw/block/nvme-ns.h|  54 +++
> >  hw/block/nvme.c   | 977
> +-
> >  hw/block/nvme.h   |   8 +
> >  hw/block/trace-events |  18 +-
> >  include/block/nvme.h  | 113 -
> 
> When you start modifying include/ files, it is recommended
> to start using scripts/git.orderfile as this makes review
> easier (no need to scroll back / up constantly).
>

Hi Philippe,

Thanks for the suggestion, indeed it makes browsing through
diffs much easier!

> As "block/nvme.h" is shared by 2 subsystems, keeping its
> changes in a separate patch is preferred.
>

No problem, I'll move all changes to include/block/nvme.h to
a separate patch.
 
> >  7 files changed, 1322 insertions(+), 23 deletions(-)
> >
> > diff --git a/block/nvme.c b/block/nvme.c
> > index 05485fdd11..7a513c9a17 100644
> > --- a/block/nvme.c
> > +++ b/block/nvme.c
> > @@ -333,7 +333,7 @@ static inline int nvme_translate_error(const
> NvmeCqe *c)
> >  {
> >  uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
> >  if (status) {
> > -trace_nvme_error(le32_to_cpu(c->result),
> > +trace_nvme_error(le32_to_cpu(c->result32),
> >   le16_to_cpu(c->sq_head),
> >   le16_to_cpu(c->sq_id),
> >   le16_to_cpu(c->cid),
> ...
> 
> > diff --git a/include/block/nvme.h b/include/block/nvme.h
> > index 3653b4aefc..ba8a45edf5 100644
> > --- a/include/block/nvme.h
> > +++ b/include/block/nvme.h
> > @@ -489,6 +489,9 @@ enum NvmeIoCommands {
> >  NVME_CMD_COMPARE= 0x05,
> >  NVME_CMD_WRITE_ZEROES   = 0x08,
> >  NVME_CMD_DSM= 0x09,
> > +NVME_CMD_ZONE_MGMT_SEND = 0x79,
> > +NVME_CMD_ZONE_MGMT_RECV = 0x7a,
> > +NVME_CMD_ZONE_APPEND= 0x7d,
> >  };
> >
> >  typedef struct QEMU_PACKED NvmeDeleteQ {
> > @@ -649,8 +652,10 @@ typedef struct QEMU_PACKED NvmeA

[PATCH v8 11/11] hw/block/nvme: Document zoned parameters in usage text

2020-10-29 Thread Dmitry Fomichev
Added brief descriptions of the new device properties that are
now available to users to configure features of Zoned Namespace
Command Set in the emulator.

This patch is for documentation only, no functionality change.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme.c | 47 ++-
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 339becd3e2..10f5c752ba 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -9,7 +9,7 @@
  */
 
 /**
- * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
+ * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
  *
  *  https://nvmexpress.org/developers/nvme-specification/
  */
@@ -22,8 +22,9 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
- *  mdts=
- *  -device nvme-ns,drive=,bus=bus_name,nsid=
+ *  mdts=,zoned.append_size_limit= \
+ *  -device nvme-ns,drive=,bus=,nsid=,\
+ *  zoned=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -41,14 +42,50 @@
  * ~~
  * - `aerl`
  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
- *   of concurrently outstanding Asynchronous Event Request commands suppoert
+ *   of concurrently outstanding Asynchronous Event Request commands support
  *   by the controller. This is a 0's based value.
  *
  * - `aer_max_queued`
  *   This is the maximum number of events that the device will enqueue for
- *   completion when there are no oustanding AERs. When the maximum number of
+ *   completion when there are no outstanding AERs. When the maximum number of
  *   enqueued events are reached, subsequent events will be dropped.
  *
+ * - `zoned.append_size_limit`
+ *   The maximum I/O size in bytes that is allowed in Zone Append command.
+ *   The default is 128KiB. Since internally this this value is maintained as
+ *   ZASL = log2( / ), some values assigned
+ *   to this property may be rounded down and result in a lower maximum ZA
+ *   data size being in effect. By setting this property to 0, users can make
+ *   ZASL to be equal to MDTS. This property only affects zoned namespaces.
+ *
+ * Setting `zoned` to true selects Zoned Command Set at the namespace.
+ * In this case, the following namespace properties are available to configure
+ * zoned operation:
+ * zoned.zsze=
+ * The number may be followed by K, M, G as in kilo-, mega- or giga-.
+ *
+ * zoned.zcap=
+ * The value 0 (default) forces zone capacity to be the same as zone
+ * size. The value of this property may not exceed zone size.
+ *
+ * zoned.descr_ext_size=
+ * This value needs to be specified in 64B units. If it is zero,
+ * namespace(s) will not support zone descriptor extensions.
+ *
+ * zoned.max_active=
+ * The default value means there is no limit to the number of
+ * concurrently active zones.
+ *
+ * zoned.max_open=
+ * The default value means there is no limit to the number of
+ * concurrently open zones.
+ *
+ * zoned.offline_zones=
+ *
+ * zoned.rdonly_zones=
+ *
+ * zoned.cross_zone_read=
+ * Setting this property to true enables Read Across Zone Boundaries.
  */
 
 #include "qemu/osdep.h"
-- 
2.21.0




[PATCH v8 06/11] hw/block/nvme: Support allocated CNS command variants

2020-10-29 Thread Dmitry Fomichev
From: Niklas Cassel 

Many CNS commands have "allocated" command variants. These variants
include a namespace as long as it is allocated; that is, a namespace
is included regardless of whether it is active (attached) or not.

While these commands are optional (they are mandatory for controllers
supporting the namespace attachment command), our QEMU implementation
is made more complete by actually providing support for these CNS values.

However, since our QEMU model currently does not support the namespace
attachment command, these new allocated CNS commands will return the
same result as the active CNS command variants.

In NVMe, a namespace is active if it exists and is attached to the
controller.

Add a new Boolean namespace flag, "attached", to provide the most
basic namespace attachment support. The default value for this new
flag is true. Also, implement the logic in the new CNS values to
include/exclude namespaces based on this new property. The only thing
missing is hooking up the actual Namespace Attachment command opcode,
which will allow a user to toggle the "attached" flag per namespace.

The reason for not hooking up this command completely is because the
NVMe specification requires the namespace management command to be
supported if the namespace attachment command is supported.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
---
 hw/block/nvme-ns.c   |  1 +
 hw/block/nvme-ns.h   |  1 +
 hw/block/nvme.c  | 68 
 include/block/nvme.h | 20 +++--
 4 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index c0362426cc..e191ef9be0 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -42,6 +42,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
 id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
 
 ns->csi = NVME_CSI_NVM;
+ns->attached = true;
 
 /* no thin provisioning */
 id_ns->ncap = id_ns->nsze;
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index d795e44bab..2d9cd29d07 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
 int64_t  size;
 NvmeIdNs id_ns;
 const uint32_t *iocs;
+bool attached;
 uint8_t  csi;
 
 NvmeNamespaceParams params;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d9e9fd264c..7b3eafad03 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1084,6 +1084,9 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 if (unlikely(!req->ns)) {
 return NVME_INVALID_FIELD | NVME_DNR;
 }
+if (!req->ns->attached) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
 
 if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
@@ -1245,6 +1248,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, 
uint32_t buf_len,
 uint32_t trans_len;
 NvmeNamespace *ns;
 time_t current_ms;
+int i;
 
 if (off >= sizeof(smart)) {
 return NVME_INVALID_FIELD | NVME_DNR;
@@ -1255,15 +1259,18 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 if (!ns) {
 return NVME_INVALID_NSID | NVME_DNR;
 }
-nvme_set_blk_stats(ns, );
+if (ns->attached) {
+nvme_set_blk_stats(ns, );
+}
 } else {
-int i;
-
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
 if (!ns) {
 continue;
 }
+if (!ns->attached) {
+continue;
+}
 nvme_set_blk_stats(ns, );
 }
 }
@@ -1560,7 +1567,8 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req,
+ bool only_active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -1577,11 +1585,16 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_rpt_empty_id_struct(n, req);
 }
 
+if (only_active && !ns->attached) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+
 return nvme_dma(n, (uint8_t *)>id_ns, sizeof(NvmeIdNs),
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
+ bool only_active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -1598,6 +1611,10 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_rpt_empty_id_struct(n

[PATCH v8 09/11] hw/block/nvme: Support Zone Descriptor Extensions

2020-10-29 Thread Dmitry Fomichev
Zone Descriptor Extension is a label that can be assigned to a zone.
It can be set to an Empty zone and it stays assigned until the zone
is reset.

This commit adds a new optional module property,
"zoned.descr_ext_size". Its value must be a multiple of 64 bytes.
If this value is non-zero, it becomes possible to assign extensions
of that size to any Empty zones. The default value for this property
is 0, therefore setting extensions is disabled by default.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.c| 25 +++--
 hw/block/nvme-ns.h|  8 +++
 hw/block/nvme.c   | 51 +--
 hw/block/trace-events |  2 ++
 4 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 2e45838c15..85dc73cf06 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -133,6 +133,18 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 return -1;
 }
 
+if (ns->params.zd_extension_size) {
+if (ns->params.zd_extension_size & 0x3f) {
+error_setg(errp,
+"zone descriptor extension size must be a multiple of 64B");
+return -1;
+}
+if ((ns->params.zd_extension_size >> 6) > 0xff) {
+error_setg(errp, "zone descriptor extension size is too large");
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -144,6 +156,10 @@ static void nvme_init_zone_state(NvmeNamespace *ns)
 int i;
 
 ns->zone_array = g_malloc0(ns->zone_array_size);
+if (ns->params.zd_extension_size) {
+ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
+  ns->num_zones);
+}
 
 QTAILQ_INIT(>exp_open_zones);
 QTAILQ_INIT(>imp_open_zones);
@@ -186,7 +202,8 @@ static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index,
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
 id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
-id_ns_z->lbafe[lba_index].zdes = 0;
+id_ns_z->lbafe[lba_index].zdes =
+ns->params.zd_extension_size >> 6; /* Units of 64B */
 
 ns->csi = NVME_CSI_ZONED;
 ns->id_ns.nsze = cpu_to_le64(ns->zone_size * ns->num_zones);
@@ -204,7 +221,8 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 
 zone->w_ptr = zone->d.wp;
 state = nvme_get_zone_state(zone);
-if (zone->d.wp != zone->d.zslba) {
+if (zone->d.wp != zone->d.zslba ||
+(zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
 if (state != NVME_ZONE_STATE_CLOSED) {
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
@@ -301,6 +319,7 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
 if (ns->params.zoned) {
 g_free(ns->id_ns_zoned);
 g_free(ns->zone_array);
+g_free(ns->zd_extensions);
 }
 }
 
@@ -332,6 +351,8 @@ static Property nvme_ns_props[] = {
params.max_active_zones, 0),
 DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
params.max_open_zones, 0),
+DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
+   params.zd_extension_size, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 421bab0a57..50a6a0e1ac 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -35,6 +35,7 @@ typedef struct NvmeNamespaceParams {
 uint64_t zone_cap_bs;
 uint32_t max_active_zones;
 uint32_t max_open_zones;
+uint32_t zd_extension_size;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -58,6 +59,7 @@ typedef struct NvmeNamespace {
 uint64_tzone_capacity;
 uint64_tzone_array_size;
 uint32_tzone_size_log2;
+uint8_t *zd_extensions;
 int32_t nr_open_zones;
 int32_t nr_active_zones;
 
@@ -127,6 +129,12 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
+ uint32_t zone_idx)
+{
+return >zd_extensions[zone_idx * ns->params.zd_extension_size];
+}
+
 static inline void nvme_aor_inc_open(NvmeNamespace *ns)
 {
 assert(ns->nr_open_zones >= 0);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 485ac6fc40..339becd3e2 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1711,6 +1711,26 @@ static uint16_t nvme_offline_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 return NVME_ZONE_INVAL_TRANSITION;
 }
 
+static uint16_t nvme_set_zd_ext(NvmeNamespace *n

[PATCH v8 08/11] hw/block/nvme: Introduce max active and open zone limits

2020-10-29 Thread Dmitry Fomichev
Add two module properties, "zoned.max_active" and "zoned.max_open"
to control the maximum number of zones that can be active or open.
Once these variables are set to non-default values, these limits are
checked during I/O and Too Many Active or Too Many Open command status
is returned if they are exceeded.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.c| 30 +-
 hw/block/nvme-ns.h| 41 +++
 hw/block/nvme.c   | 94 +++
 hw/block/trace-events |  2 +
 4 files changed, 165 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index e6db7f7d3b..2e45838c15 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -119,6 +119,20 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 ns->zone_size_log2 = 63 - clz64(ns->zone_size);
 }
 
+/* Make sure that the values of all ZNS properties are sane */
+if (ns->params.max_open_zones > nz) {
+error_setg(errp,
+   "max_open_zones value %u exceeds the number of zones %u",
+   ns->params.max_open_zones, nz);
+return -1;
+}
+if (ns->params.max_active_zones > nz) {
+error_setg(errp,
+   "max_active_zones value %u exceeds the number of zones %u",
+   ns->params.max_active_zones, nz);
+return -1;
+}
+
 return 0;
 }
 
@@ -166,8 +180,8 @@ static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index,
 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
 
 /* MAR/MOR are zeroes-based, 0x means no limit */
-id_ns_z->mar = 0x;
-id_ns_z->mor = 0x;
+id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
+id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
 id_ns_z->zoc = 0;
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
@@ -195,6 +209,7 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
 }
+nvme_aor_inc_active(ns);
 QTAILQ_INSERT_HEAD(>closed_zones, zone, entry);
 } else {
 trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
@@ -211,16 +226,23 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
 
 QTAILQ_FOREACH_SAFE(zone, >closed_zones, entry, next) {
 QTAILQ_REMOVE(>closed_zones, zone, entry);
+nvme_aor_dec_active(ns);
 nvme_clear_zone(ns, zone);
 }
 QTAILQ_FOREACH_SAFE(zone, >imp_open_zones, entry, next) {
 QTAILQ_REMOVE(>imp_open_zones, zone, entry);
+nvme_aor_dec_open(ns);
+nvme_aor_dec_active(ns);
 nvme_clear_zone(ns, zone);
 }
 QTAILQ_FOREACH_SAFE(zone, >exp_open_zones, entry, next) {
 QTAILQ_REMOVE(>exp_open_zones, zone, entry);
+nvme_aor_dec_open(ns);
+nvme_aor_dec_active(ns);
 nvme_clear_zone(ns, zone);
 }
+
+assert(ns->nr_open_zones == 0);
 }
 
 static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
@@ -306,6 +328,10 @@ static Property nvme_ns_props[] = {
 DEFINE_PROP_SIZE("zoned.zcap", NvmeNamespace, params.zone_cap_bs, 0),
 DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace,
  params.cross_zone_read, false),
+DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace,
+   params.max_active_zones, 0),
+DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
+   params.max_open_zones, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index d2631ff5a3..421bab0a57 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -33,6 +33,8 @@ typedef struct NvmeNamespaceParams {
 bool cross_zone_read;
 uint64_t zone_size_bs;
 uint64_t zone_cap_bs;
+uint32_t max_active_zones;
+uint32_t max_open_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -56,6 +58,8 @@ typedef struct NvmeNamespace {
 uint64_tzone_capacity;
 uint64_tzone_array_size;
 uint32_tzone_size_log2;
+int32_t nr_open_zones;
+int32_t nr_active_zones;
 
 NvmeNamespaceParams params;
 } NvmeNamespace;
@@ -123,6 +127,43 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline void nvme_aor_inc_open(NvmeNamespace *ns)
+{
+assert(ns->nr_open_zones >= 0);
+if (ns->params.max_open_zones) {
+ns->nr_open_zones++;
+assert(ns->nr_open_zones <= ns->params.max_open_zones);
+}
+}
+
+static inline void nvme_aor_dec_open(NvmeNamespa

[PATCH v8 05/11] hw/block/nvme: Add support for Namespace Types

2020-10-29 Thread Dmitry Fomichev
From: Niklas Cassel 

Define the structures and constants required to implement
Namespace Types support.

Namespace Types introduce a new command set, "I/O Command Sets",
that allows the host to retrieve the command sets associated with
a namespace. Introduce support for the command set and enable
detection for the NVM Command Set.

The new workflows for identify commands rely heavily on zero-filled
identify structs. E.g., certain CNS commands are defined to return
a zero-filled identify struct when an inactive namespace NSID
is supplied.

Add a helper function in order to avoid code duplication when
reporting zero-filled identify structures.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
---
 hw/block/nvme-ns.c|   2 +
 hw/block/nvme-ns.h|   1 +
 hw/block/nvme.c   | 188 +++---
 hw/block/trace-events |   7 ++
 include/block/nvme.h  |  66 +++
 5 files changed, 219 insertions(+), 45 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index de735eb9f3..c0362426cc 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -41,6 +41,8 @@ static void nvme_ns_init(NvmeNamespace *ns)
 
 id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
 
+ns->csi = NVME_CSI_NVM;
+
 /* no thin provisioning */
 id_ns->ncap = id_ns->nsze;
 id_ns->nuse = id_ns->ncap;
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index a38071884a..d795e44bab 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
 int64_t  size;
 NvmeIdNs id_ns;
 const uint32_t *iocs;
+uint8_t  csi;
 
 NvmeNamespaceParams params;
 } NvmeNamespace;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 2a33540542..d9e9fd264c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1336,7 +1336,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, 
uint32_t buf_len,
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
+static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
  uint64_t off, NvmeRequest *req)
 {
 NvmeEffectsLog log = {};
@@ -1351,8 +1351,15 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t 
buf_len,
 switch (NVME_CC_CSS(n->bar.cc)) {
 case NVME_CC_CSS_NVM:
 src_iocs = nvme_cse_iocs_nvm;
+/* fall through */
 case NVME_CC_CSS_ADMIN_ONLY:
 break;
+case NVME_CC_CSS_CSI:
+switch (csi) {
+case NVME_CSI_NVM:
+src_iocs = nvme_cse_iocs_nvm;
+break;
+}
 }
 
 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
@@ -1378,6 +1385,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
*req)
 uint8_t  lid = dw10 & 0xff;
 uint8_t  lsp = (dw10 >> 8) & 0xf;
 uint8_t  rae = (dw10 >> 15) & 0x1;
+uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
 uint32_t numdl, numdu;
 uint64_t off, lpol, lpou;
 size_t   len;
@@ -1411,7 +1419,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
*req)
 case NVME_LOG_FW_SLOT_INFO:
 return nvme_fw_log_info(n, len, off, req);
 case NVME_LOG_CMD_EFFECTS:
-return nvme_cmd_effects(n, len, off, req);
+return nvme_cmd_effects(n, csi, len, off, req);
 default:
 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
 return NVME_INVALID_FIELD | NVME_DNR;
@@ -1524,6 +1532,13 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_SUCCESS;
 }
 
+static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
+{
+uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
+
+return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
 {
 trace_pci_nvme_identify_ctrl();
@@ -1532,11 +1547,23 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, 
NvmeRequest *req)
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeIdentify *c = (NvmeIdentify *)>cmd;
+
+trace_pci_nvme_identify_ctrl_csi(c->csi);
+
+if (c->csi == NVME_CSI_NVM) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
-NvmeIdNs *id_ns, inactive = { 0 };
 uint32_t nsid = le32_to_cpu(c->nsid);
 
 trace_pci_nvme_identify_ns(nsid);
@@ -1547,23 +1574,46 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req)
 
 ns = nvme_ns(n, nsid);
 if (unlikely(!ns)) {
-id_ns = 
-} else {
-id_ns = >id_ns;
+return nvme_rpt_empty_id_struct(n, req)

[PATCH v8 03/11] hw/block/nvme: Separate read and write handlers

2020-10-29 Thread Dmitry Fomichev
With ZNS support in place, the majority of code in nvme_rw() has
become read- or write-specific. Move these parts to two separate
handlers, nvme_read() and nvme_write() to make the code more
readable and to remove multiple is_write checks that so far existed
in the i/o path.

This is a refactoring patch, no change in functionality.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Acked-by: Klaus Jensen 
---
 hw/block/nvme.c   | 91 ++-
 hw/block/trace-events |  3 +-
 2 files changed, 67 insertions(+), 27 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1ec8ccb3e6..3023774484 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -963,6 +963,54 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
+static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
+NvmeNamespace *ns = req->ns;
+uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+uint64_t data_size = nvme_l2b(ns, nlb);
+uint64_t data_offset;
+BlockBackend *blk = ns->blkconf.blk;
+uint16_t status;
+
+trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba);
+
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
+
+status = nvme_check_bounds(n, ns, slba, nlb);
+if (status) {
+trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+goto invalid;
+}
+
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_READ);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_read(blk, >qsg, data_offset,
+  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_preadv(blk, data_offset, >iov, 0,
+nvme_rw_cb, req);
+}
+return NVME_NO_COMPLETE;
+
+invalid:
+block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
+return status | NVME_DNR;
+}
+
 static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
@@ -988,22 +1036,19 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
 NvmeNamespace *ns = req->ns;
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
-
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t data_size = nvme_l2b(ns, nlb);
-uint64_t data_offset = nvme_l2b(ns, slba);
-enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ?
-BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+uint64_t data_offset;
 BlockBackend *blk = ns->blkconf.blk;
 uint16_t status;
 
-trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode),
-  nvme_nsid(ns), nlb, data_size, slba);
+trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
+ nvme_nsid(ns), nlb, data_size, slba);
 
 status = nvme_check_mdts(n, data_size);
 if (status) {
@@ -1022,29 +1067,22 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 goto invalid;
 }
 
-block_acct_start(blk_get_stats(blk), >acct, data_size, acct);
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_WRITE);
 if (req->qsg.sg) {
-if (acct == BLOCK_ACCT_WRITE) {
-req->aiocb = dma_blk_write(blk, >qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-} else {
-req->aiocb = dma_blk_read(blk, >qsg, data_offset,
-  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-}
+req->aiocb = dma_blk_write(blk, >qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
 } else {
-if (acct == BLOCK_ACCT_WRITE) {
-req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
- nvme_rw_cb, req);
-} else {
-req->aiocb = blk_aio_preadv(blk, data_offset, >iov, 0,
-nvme_rw_cb, req);
-}
+req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
+ nvme_rw_cb, req);
 }
 return NVME_NO_COMPLETE;
 
 invalid:
-block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct)

[PATCH v8 10/11] hw/block/nvme: Add injection of Offline/Read-Only zones

2020-10-29 Thread Dmitry Fomichev
ZNS specification defines two zone conditions for the zones that no
longer can function properly, possibly because of flash wear or other
internal fault. It is useful to be able to "inject" a small number of
such zones for testing purposes.

This commit defines two optional device properties, "offline_zones"
and "rdonly_zones". Users can assign non-zero values to these variables
to specify the number of zones to be initialized as Offline or
Read-Only. The actual number of injected zones may be smaller than the
requested amount - Read-Only and Offline counts are expected to be much
smaller than the total number of zones on a drive.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.c | 52 ++
 hw/block/nvme-ns.h |  2 ++
 2 files changed, 54 insertions(+)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 85dc73cf06..5e4a6705cd 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -21,6 +21,7 @@
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
+#include "crypto/random.h"
 
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
@@ -145,6 +146,20 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 }
 }
 
+if (ns->params.max_open_zones < nz) {
+if (ns->params.nr_offline_zones > nz - ns->params.max_open_zones) {
+error_setg(errp, "offline_zones value %u is too large",
+ns->params.nr_offline_zones);
+return -1;
+}
+if (ns->params.nr_rdonly_zones >
+nz - ns->params.max_open_zones - ns->params.nr_offline_zones) {
+error_setg(errp, "rdonly_zones value %u is too large",
+ns->params.nr_rdonly_zones);
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -153,7 +168,9 @@ static void nvme_init_zone_state(NvmeNamespace *ns)
 uint64_t start = 0, zone_size = ns->zone_size;
 uint64_t capacity = ns->num_zones * zone_size;
 NvmeZone *zone;
+uint32_t rnd;
 int i;
+uint16_t zs;
 
 ns->zone_array = g_malloc0(ns->zone_array_size);
 if (ns->params.zd_extension_size) {
@@ -180,6 +197,37 @@ static void nvme_init_zone_state(NvmeNamespace *ns)
 zone->w_ptr = start;
 start += zone_size;
 }
+
+/* If required, make some zones Offline or Read Only */
+
+for (i = 0; i < ns->params.nr_offline_zones; i++) {
+do {
+qcrypto_random_bytes(, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = >zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_OFFLINE);
+} else {
+i--;
+}
+}
+
+for (i = 0; i < ns->params.nr_rdonly_zones; i++) {
+do {
+qcrypto_random_bytes(, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = >zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE &&
+zs != NVME_ZONE_STATE_READ_ONLY) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_READ_ONLY);
+} else {
+i--;
+}
+}
 }
 
 static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index,
@@ -353,6 +401,10 @@ static Property nvme_ns_props[] = {
params.max_open_zones, 0),
 DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
params.zd_extension_size, 0),
+DEFINE_PROP_UINT32("zoned.offline_zones", NvmeNamespace,
+   params.nr_offline_zones, 0),
+DEFINE_PROP_UINT32("zoned.rdonly_zones", NvmeNamespace,
+   params.nr_rdonly_zones, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 50a6a0e1ac..b30478e5d7 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -36,6 +36,8 @@ typedef struct NvmeNamespaceParams {
 uint32_t max_active_zones;
 uint32_t max_open_zones;
 uint32_t zd_extension_size;
+uint32_t nr_offline_zones;
+uint32_t nr_rdonly_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
-- 
2.21.0




[PATCH v8 01/11] hw/block/nvme: Add Commands Supported and Effects log

2020-10-29 Thread Dmitry Fomichev
This log page becomes necessary to implement to allow checking for
Zone Append command support in Zoned Namespace Command Set.

This commit adds the code to report this log page for NVM Command
Set only. The parts that are specific to zoned operation will be
added later in the series.

All incoming admin and i/o commands are now only processed if their
corresponding support bits are set in this log. This provides an
easy way to control which commands to support and which not to,
depending on the currently set CC.CSS value.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  1 +
 hw/block/nvme.c   | 96 +++
 hw/block/trace-events |  1 +
 include/block/nvme.h  | 19 +
 4 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 83734f4606..ea8c2f785d 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -29,6 +29,7 @@ typedef struct NvmeNamespace {
 int32_t  bootindex;
 int64_t  size;
 NvmeIdNs id_ns;
+const uint32_t *iocs;
 
 NvmeNamespaceParams params;
 } NvmeNamespace;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 9d30ca69dc..9fed061a9d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -111,6 +111,28 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
 };
 
+static const uint32_t nvme_cse_acs[256] = {
+[NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_DELETE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ABORT]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
+};
+
+static const uint32_t nvme_cse_iocs_none[256];
+
+static const uint32_t nvme_cse_iocs_nvm[256] = {
+[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
+};
+
 static void nvme_process_sq(void *opaque);
 
 static uint16_t nvme_cid(NvmeRequest *req)
@@ -1032,10 +1054,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
   req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
 
-if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) {
-return NVME_INVALID_OPCODE | NVME_DNR;
-}
-
 if (!nvme_nsid_valid(n, nsid)) {
 return NVME_INVALID_NSID | NVME_DNR;
 }
@@ -1045,6 +1063,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
+if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
+trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
+return NVME_INVALID_OPCODE | NVME_DNR;
+}
+
 switch (req->cmd.opcode) {
 case NVME_CMD_FLUSH:
 return nvme_flush(n, req);
@@ -1054,8 +1077,7 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 case NVME_CMD_READ:
 return nvme_rw(n, req);
 default:
-trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
-return NVME_INVALID_OPCODE | NVME_DNR;
+assert(false);
 }
 }
 
@@ -1291,6 +1313,37 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
+ uint64_t off, NvmeRequest *req)
+{
+NvmeEffectsLog log = {};
+const uint32_t *src_iocs = NULL;
+uint32_t trans_len;
+
+if (off >= sizeof(log)) {
+trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+switch (NVME_CC_CSS(n->bar.cc)) {
+case NVME_CC_CSS_NVM:
+src_iocs = nvme_cse_iocs_nvm;
+case NVME_CC_CSS_ADMIN_ONLY:
+break;
+}
+
+memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
+
+if (src_iocs) {
+memcpy(log.iocs, src_iocs, sizeof(log.iocs));
+}
+
+trans_len = MIN(sizeof(log) - off, buf_len);
+
+return nvme_dma(n, ((uint8_t *)&log) + off, trans_len,
+DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeCmd *cmd = &req->cmd;
@@ -1334,6 +1387,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_smart_info(n, rae, len, off, req);
 cas

[PATCH v8 07/11] hw/block/nvme: Support Zoned Namespace Command Set

2020-10-29 Thread Dmitry Fomichev
The emulation code has been changed to advertise NVM Command Set when
"zoned" device property is not set (default) and Zoned Namespace
Command Set otherwise.

Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
Define trace events where needed in newly introduced code.

In order to improve scalability, all open, closed and full zones
are organized in separate linked lists. Consequently, almost all
zone operations don't require scanning of the entire zone array
(which potentially can be quite large) - it is only necessary to
enumerate one or more zone lists.

Handlers for three new NVMe commands introduced in Zoned Namespace
Command Set specification are added, namely for Zone Management
Receive, Zone Management Send and Zone Append.

Device initialization code has been extended to create a proper
configuration for zoned operation using device properties.

Read/Write command handler is modified to only allow writes at the
write pointer if the namespace is zoned. For Zone Append command,
writes implicitly happen at the write pointer and the starting write
pointer value is returned as the result of the command. Write Zeroes
handler is modified to add zoned checks that are identical to those
done as a part of Write flow.

Subsequent commits in this series add ZDE support and checks for
active and open zone limits.

Signed-off-by: Niklas Cassel 
Signed-off-by: Hans Holmberg 
Signed-off-by: Ajay Joshi 
Signed-off-by: Chaitanya Kulkarni 
Signed-off-by: Matias Bjorling 
Signed-off-by: Aravind Ramesh 
Signed-off-by: Shin'ichiro Kawasaki 
Signed-off-by: Adam Manzanares 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 block/nvme.c  |   2 +-
 hw/block/nvme-ns.c| 173 
 hw/block/nvme-ns.h|  54 +++
 hw/block/nvme.c   | 977 +-
 hw/block/nvme.h   |   8 +
 hw/block/trace-events |  18 +-
 include/block/nvme.h  | 113 -
 7 files changed, 1322 insertions(+), 23 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 05485fdd11..7a513c9a17 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -333,7 +333,7 @@ static inline int nvme_translate_error(const NvmeCqe *c)
 {
 uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
 if (status) {
-trace_nvme_error(le32_to_cpu(c->result),
+trace_nvme_error(le32_to_cpu(c->result32),
  le16_to_cpu(c->sq_head),
  le16_to_cpu(c->sq_id),
  le16_to_cpu(c->cid),
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index e191ef9be0..e6db7f7d3b 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -25,6 +25,7 @@
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
 
+#include "trace.h"
 #include "nvme.h"
 #include "nvme-ns.h"
 
@@ -77,6 +78,151 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, 
Error **errp)
 return 0;
 }
 
+static int nvme_calc_zone_geometry(NvmeNamespace *ns, Error **errp)
+{
+uint64_t zone_size, zone_cap;
+uint32_t nz, lbasz = ns->blkconf.logical_block_size;
+
+if (ns->params.zone_size_bs) {
+zone_size = ns->params.zone_size_bs;
+} else {
+zone_size = NVME_DEFAULT_ZONE_SIZE;
+}
+if (ns->params.zone_cap_bs) {
+zone_cap = ns->params.zone_cap_bs;
+} else {
+zone_cap = zone_size;
+}
+if (zone_cap > zone_size) {
+error_setg(errp, "zone capacity %luB exceeds zone size %luB",
+   zone_cap, zone_size);
+return -1;
+}
+if (zone_size < lbasz) {
+error_setg(errp, "zone size %luB too small, must be at least %uB",
+   zone_size, lbasz);
+return -1;
+}
+if (zone_cap < lbasz) {
+error_setg(errp, "zone capacity %luB too small, must be at least %uB",
+   zone_cap, lbasz);
+return -1;
+}
+ns->zone_size = zone_size / lbasz;
+ns->zone_capacity = zone_cap / lbasz;
+
+nz = DIV_ROUND_UP(ns->size / lbasz, ns->zone_size);
+ns->num_zones = nz;
+ns->zone_array_size = sizeof(NvmeZone) * nz;
+ns->zone_size_log2 = 0;
+if (is_power_of_2(ns->zone_size)) {
+ns->zone_size_log2 = 63 - clz64(ns->zone_size);
+}
+
+return 0;
+}
+
+static void nvme_init_zone_state(NvmeNamespace *ns)
+{
+uint64_t start = 0, zone_size = ns->zone_size;
+uint64_t capacity = ns->num_zones * zone_size;
+NvmeZone *zone;
+int i;
+
+ns->zone_array = g_malloc0(ns->zone_array_size);
+
+QTAILQ_INIT(&ns->exp_open_zones);
+QTAILQ_INIT(&ns->imp_open_zones);
+QTAILQ_INIT(&ns->closed_zones);
+QTAILQ_INIT(&ns->full_zones);
+
+zone = ns->zone_array;
+for (i = 0; i < ns->num_zones; i++, zone++) 

[PATCH v8 02/11] hw/block/nvme: Generate namespace UUIDs

2020-10-29 Thread Dmitry Fomichev
In NVMe 1.4, a namespace must report an ID descriptor of UUID type
if it doesn't support EUI64 or NGUID. Add a new namespace property,
"uuid", that provides the user the option to either specify the UUID
explicitly or have a UUID generated automatically every time a
namespace is initialized.

Suggested-by: Klaus Jensen 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.c | 1 +
 hw/block/nvme-ns.h | 1 +
 hw/block/nvme.c| 9 +
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index b69cdaf27e..de735eb9f3 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -129,6 +129,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 static Property nvme_ns_props[] = {
 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
+DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index ea8c2f785d..a38071884a 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -21,6 +21,7 @@
 
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
+QemuUUID uuid;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 9fed061a9d..1ec8ccb3e6 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1572,6 +1572,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
+NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
 uint32_t nsid = le32_to_cpu(c->nsid);
 uint8_t list[NVME_IDENTIFY_DATA_SIZE];
@@ -1591,7 +1592,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_NSID | NVME_DNR;
 }
 
-if (unlikely(!nvme_ns(n, nsid))) {
+ns = nvme_ns(n, nsid);
+if (unlikely(!ns)) {
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
@@ -1600,12 +1602,11 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl 
*n, NvmeRequest *req)
 /*
  * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
  * structure, a Namespace UUID (nidt = 0x3) must be reported in the
- * Namespace Identification Descriptor. Add a very basic Namespace UUID
- * here.
+ * Namespace Identification Descriptor. Add the namespace UUID here.
  */
 ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
 ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
-stl_be_p(&ns_descrs->uuid.v, nsid);
+memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDT_UUID_LEN);
 
 return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
 DMA_DIRECTION_FROM_DEVICE, req);
-- 
2.21.0




[PATCH v8 04/11] hw/block/nvme: Merge nvme_write_zeroes() with nvme_write()

2020-10-29 Thread Dmitry Fomichev
nvme_write() now handles WRITE, WRITE ZEROES and ZONE_APPEND.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Acked-by: Klaus Jensen 
---
 hw/block/nvme.c   | 72 +--
 hw/block/trace-events |  1 -
 2 files changed, 28 insertions(+), 45 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3023774484..2a33540542 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1011,32 +1011,7 @@ invalid:
 return status | NVME_DNR;
 }
 
-static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
-{
-NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
-NvmeNamespace *ns = req->ns;
-uint64_t slba = le64_to_cpu(rw->slba);
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
-uint64_t offset = nvme_l2b(ns, slba);
-uint32_t count = nvme_l2b(ns, nlb);
-uint16_t status;
-
-trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
-
-status = nvme_check_bounds(n, ns, slba, nlb);
-if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
-return status;
-}
-
-block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
- BLOCK_ACCT_WRITE);
-req->aiocb = blk_aio_pwrite_zeroes(req->ns->blkconf.blk, offset, count,
-   BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
-return NVME_NO_COMPLETE;
-}
-
-static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req, bool wrz)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
 NvmeNamespace *ns = req->ns;
@@ -1050,10 +1025,12 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
  nvme_nsid(ns), nlb, data_size, slba);
 
-status = nvme_check_mdts(n, data_size);
-if (status) {
-trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
-goto invalid;
+if (!wrz) {
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
 }
 
 status = nvme_check_bounds(n, ns, slba, nlb);
@@ -1062,21 +1039,28 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req)
 goto invalid;
 }
 
-status = nvme_map_dptr(n, data_size, req);
-if (status) {
-goto invalid;
-}
-
 data_offset = nvme_l2b(ns, slba);
 
-block_acct_start(blk_get_stats(blk), &req->acct, data_size,
- BLOCK_ACCT_WRITE);
-if (req->qsg.sg) {
-req->aiocb = dma_blk_write(blk, &req->qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+if (!wrz) {
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+ BLOCK_ACCT_WRITE);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_write(blk, &req->qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0,
+ nvme_rw_cb, req);
+}
 } else {
-req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0,
- nvme_rw_cb, req);
+block_acct_start(blk_get_stats(blk), &req->acct, 0, BLOCK_ACCT_WRITE);
+req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
+   BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
+   req);
 }
 return NVME_NO_COMPLETE;
 
@@ -1110,9 +1094,9 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 case NVME_CMD_FLUSH:
 return nvme_flush(n, req);
 case NVME_CMD_WRITE_ZEROES:
-return nvme_write_zeroes(n, req);
+return nvme_write(n, req, true);
 case NVME_CMD_WRITE:
-return nvme_write(n, req);
+return nvme_write(n, req, false);
 case NVME_CMD_READ:
 return nvme_read(n, req);
 default:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index d81b1891d8..658633177d 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -43,7 +43,6 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t 
opcode, const char *opna
 pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, 
uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 
0x%"PRIx64""
 pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, 
uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb 
%"PRIu32" count %"PRIu64" lba 0x%"P

[PATCH v8 00/11] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set

2020-10-29 Thread Dmitry Fomichev
s and annotations
* Rename ZAMDS to ZASL
* Correct a few format expressions and some wording in
  trace event definitions
* Remove validation code to return NVME_CAP_EXCEEDED error
* Make ZASL to be equal to MDTS if "zone_append_size_limit"
  module parameter is not set
* Clean up nvme_zoned_init_ctrl() to make size calculations
  less confusing
* Avoid changing module parameters, use separate n/s variables
  if additional calculations are necessary to convert parameters
  to running values
* Use NVME_DEFAULT_ZONE_SIZE to assign the default zone size value
* Use default 0 for zone capacity meaning that zone capacity will
  be equal to zone size by default
* Issue warnings if user MAR/MOR values are too large and have
  to be adjusted
* Use unsigned values for MAR/MOR
 - Dropped "Simulate Zone Active excursions" patch.
   Excursion behavior may depend on the internal controller
   architecture and therefore be vendor-specific.
 - Dropped support for Zone Attributes and zoned AENs for now.
   These features can be added in a future series.
 - NS Types support is extended to handle active/inactive namespaces.
 - Update the write pointer after backing storage I/O completion, not
   before. This makes the emulation to run correctly in case of
   backing device failures.
 - Avoid division in the I/O path if the device zone size is
   a power of two (the most common case). Zone index then can be
   calculated by using bit shift.
 - A few reported bugs have been fixed.
 - Indentation in function definitions has been changed to make it
   the same as the rest of the code.

Zoned Namespace (ZNS) Command Set is a newly introduced command set
published by the NVM Express, Inc. organization as TP 4053. The main
design goals of ZNS are to provide hardware designers the means to
reduce NVMe controller complexity and to allow achieving a better I/O
latency and throughput. SSDs that implement this interface are
commonly known as ZNS SSDs.

This command set is implementing a zoned storage model, similarly to
ZAC/ZBC. As such, there is already support in Linux, allowing one to
perform the majority of tasks needed for managing ZNS SSDs.

The Zoned Namespace Command Set relies on another TP, known as
Namespace Types (NVMe TP 4056), which introduces support for having
multiple command sets per namespace.

Both ZNS and Namespace Types specifications can be downloaded by
visiting the following link -

https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs.zip

This patch series adds Namespace Types support and zoned namespace
emulation capability to the existing NVMe PCI device.


Based-on: <20201013174826.ga1049...@dhcp-10-100-145-180.wdl.wdc.com>

Dmitry Fomichev (9):
  hw/block/nvme: Add Commands Supported and Effects log
  hw/block/nvme: Generate namespace UUIDs
  hw/block/nvme: Separate read and write handlers
  hw/block/nvme: Merge nvme_write_zeroes() with nvme_write()
  hw/block/nvme: Support Zoned Namespace Command Set
  hw/block/nvme: Introduce max active and open zone limits
  hw/block/nvme: Support Zone Descriptor Extensions
  hw/block/nvme: Add injection of Offline/Read-Only zones
  hw/block/nvme: Document zoned parameters in usage text

Niklas Cassel (2):
  hw/block/nvme: Add support for Namespace Types
  hw/block/nvme: Support allocated CNS command variants

 block/nvme.c  |2 +-
 hw/block/nvme-ns.c|  276 +++
 hw/block/nvme-ns.h|  109 +++
 hw/block/nvme.c   | 1615 ++---
 hw/block/nvme.h   |8 +
 hw/block/trace-events |   32 +-
 include/block/nvme.h  |  204 +-
 7 files changed, 2113 insertions(+), 133 deletions(-)

-- 
2.21.0




RE: [PATCH v7 05/11] hw/block/nvme: Support Zoned Namespace Command Set

2020-10-21 Thread Dmitry Fomichev
> -Original Message-
> From: Klaus Jensen 
> Sent: Wednesday, October 21, 2020 6:26 AM
> To: Dmitry Fomichev 
> Cc: Keith Busch ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Maxim Levitsky
> ; Fam Zheng ; Niklas Cassel
> ; Damien Le Moal ;
> qemu-bl...@nongnu.org; qemu-devel@nongnu.org; Alistair Francis
> ; Matias Bjorling 
> Subject: Re: [PATCH v7 05/11] hw/block/nvme: Support Zoned Namespace
> Command Set
> 
> On Oct 19 11:17, Dmitry Fomichev wrote:
> > +/*
> > + * Close or finish all the zones that are currently open.
> > + */
> > +static void nvme_zoned_clear_ns(NvmeNamespace *ns)
> > +{
> > +NvmeZone *zone;
> > +uint32_t set_state;
> > +int i;
> > +
> > +zone = ns->zone_array;
> > +for (i = 0; i < ns->num_zones; i++, zone++) {
> > +switch (nvme_get_zone_state(zone)) {
> > +case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> > +QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
> > +break;
> > +case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> > +QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
> > +break;
> > +case NVME_ZONE_STATE_CLOSED:
> > +/* fall through */
> > +default:
> > +continue;
> > +}
> > +
> > +if (zone->d.wp == zone->d.zslba) {
> > +set_state = NVME_ZONE_STATE_EMPTY;
> > +} else {
> > +set_state = NVME_ZONE_STATE_CLOSED;
> > +}
> > +
> > +switch (set_state) {
> > +case NVME_ZONE_STATE_CLOSED:
> > +trace_pci_nvme_clear_ns_close(nvme_get_zone_state(zone),
> > +  zone->d.zslba);
> > +QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
> > +break;
> > +case NVME_ZONE_STATE_EMPTY:
> > +trace_pci_nvme_clear_ns_reset(nvme_get_zone_state(zone),
> > +  zone->d.zslba);
> > +break;
> > +case NVME_ZONE_STATE_FULL:
> > +trace_pci_nvme_clear_ns_full(nvme_get_zone_state(zone),
> > + zone->d.zslba);
> > +zone->d.wp = nvme_zone_wr_boundary(zone);
> > +QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
> > +}
> 
> No need for the switch here - just add to the closed list in the
> conditional.

The switch becomes handy later in the series, particularly after adding
descriptor extensions. For easier reviewing, it makes sense to add it from
the beginning even though it is rudimentary at this point.

> 
> The NVME_ZONE_STATE_FULL case is unreachable.

Indeed. This should be introduced in the next patch.

Now, I've looked at this code again and the active/open counting in this
function ends up to be not quite right, I am fixing it.

> 
> > +
> > +zone->w_ptr = zone->d.wp;
> > +nvme_set_zone_state(zone, set_state);
> > +}
> > +}


RE: [PATCH v7 03/11] hw/block/nvme: Add support for Namespace Types

2020-10-20 Thread Dmitry Fomichev
> -Original Message-
> From: Klaus Jensen 
> Sent: Monday, October 19, 2020 4:54 PM
> To: Dmitry Fomichev 
> Cc: Keith Busch ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Maxim Levitsky
> ; Fam Zheng ; Niklas Cassel
> ; Damien Le Moal ;
> qemu-bl...@nongnu.org; qemu-devel@nongnu.org; Alistair Francis
> ; Matias Bjorling 
> Subject: Re: [PATCH v7 03/11] hw/block/nvme: Add support for Namespace
> Types
> 
> On Oct 19 11:17, Dmitry Fomichev wrote:
> > From: Niklas Cassel 
> >
> > Define the structures and constants required to implement
> > Namespace Types support.
> >
> > Namespace Types introduce a new command set, "I/O Command Sets",
> > that allows the host to retrieve the command sets associated with
> > a namespace. Introduce support for the command set and enable
> > detection for the NVM Command Set.
> >
> > The new workflows for identify commands rely heavily on zero-filled
> > identify structs. E.g., certain CNS commands are defined to return
> > a zero-filled identify struct when an inactive namespace NSID
> > is supplied.
> >
> > Add a helper function in order to avoid code duplication when
> > reporting zero-filled identify structures.
> >
> > Signed-off-by: Niklas Cassel 
> > Signed-off-by: Dmitry Fomichev 
> > ---
> >  hw/block/nvme-ns.c|   2 +
> >  hw/block/nvme-ns.h|   1 +
> >  hw/block/nvme.c   | 169 +++---
> >  hw/block/trace-events |   7 ++
> >  include/block/nvme.h  |  65 
> >  5 files changed, 202 insertions(+), 42 deletions(-)
> >
> > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > index de735eb9f3..c0362426cc 100644
> > --- a/hw/block/nvme-ns.c
> > +++ b/hw/block/nvme-ns.c
> > @@ -41,6 +41,8 @@ static void nvme_ns_init(NvmeNamespace *ns)
> >
> >  id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
> >
> > +ns->csi = NVME_CSI_NVM;
> > +
> >  /* no thin provisioning */
> >  id_ns->ncap = id_ns->nsze;
> >  id_ns->nuse = id_ns->ncap;
> > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > index a38071884a..d795e44bab 100644
> > --- a/hw/block/nvme-ns.h
> > +++ b/hw/block/nvme-ns.h
> > @@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
> >  int64_t  size;
> >  NvmeIdNs id_ns;
> >  const uint32_t *iocs;
> > +uint8_t  csi;
> >
> >  NvmeNamespaceParams params;
> >  } NvmeNamespace;
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 29139d8a17..ca0d0abf5c 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -1503,6 +1503,13 @@ static uint16_t nvme_create_cq(NvmeCtrl *n,
> NvmeRequest *req)
> >  return NVME_SUCCESS;
> >  }
> >
> > +static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest
> *req)
> > +{
> > +uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
> 
> [-pedantic] empty initializer list
> 
> > +
> > +return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE,
> req);
> > +}
> > +
> >  static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >  trace_pci_nvme_identify_ctrl();
> > @@ -1511,11 +1518,23 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl
> *n, NvmeRequest *req)
> >  DMA_DIRECTION_FROM_DEVICE, req);
> >  }
> >
> > +static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
> > +{
> > +NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
> > +
> > +trace_pci_nvme_identify_ctrl_csi(c->csi);
> > +
> > +if (c->csi == NVME_CSI_NVM) {
> > +return nvme_rpt_empty_id_struct(n, req);
> > +}
> > +
> > +return NVME_INVALID_FIELD | NVME_DNR;
> > +}
> > +
> >  static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >  NvmeNamespace *ns;
> >  NvmeIdentify *c = (NvmeIdentify *)>cmd;
> > -NvmeIdNs *id_ns, inactive = { 0 };
> >  uint32_t nsid = le32_to_cpu(c->nsid);
> >
> >  trace_pci_nvme_identify_ns(nsid);
> > @@ -1526,23 +1545,46 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n,
> NvmeRequest *req)
> >
> >  ns = nvme_ns(n, nsid);
> >  if (unlikely(!ns)) {
> > -id_ns = &inactive;
> > -} else {
> > -id_ns = &ns->id_ns;
> > +return nvme_rpt_empty_id_struct(n, req);
> >  }
> >
> > -return nvme_dma(n, (uint8_t *)id_ns, sizeof(NvmeIdNs),
> &

RE: [PATCH v7 04/11] hw/block/nvme: Support allocated CNS command variants

2020-10-20 Thread Dmitry Fomichev
> -Original Message-
> From: Klaus Jensen 
> Sent: Tuesday, October 20, 2020 4:21 AM
> To: Dmitry Fomichev 
> Cc: Keith Busch ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Maxim Levitsky
> ; Fam Zheng ; Niklas Cassel
> ; Damien Le Moal ;
> qemu-bl...@nongnu.org; qemu-devel@nongnu.org; Alistair Francis
> ; Matias Bjorling 
> Subject: Re: [PATCH v7 04/11] hw/block/nvme: Support allocated CNS
> command variants
> 
> On Oct 19 11:17, Dmitry Fomichev wrote:
> 
> (snip)
> 
> > CAP.CSS (together with the I/O Command Set data structure) defines
> > what command sets are supported by the controller.
> >
> > CC.CSS (together with Set Profile) can be set to enable a subset of
> > the available command sets.
> >
> > Even if a user configures CC.CSS to e.g. Admin only, NVM namespaces
> > will still be attached (and thus marked as active).
> > Similarly, if a user configures CC.CSS to e.g. NVM, ZNS namespaces
> > will still be attached (and thus marked as active).
> >
> > However, any operation from a disabled command set will result in a
> > Invalid Command Opcode.
> >
> 
> This part of the commit message seems irrelevant to the patch.
> 
> > Add a new Boolean namespace property, "attached", to provide the most
> > basic namespace attachment support. The default value for this new
> > property is true. Also, implement the logic in the new CNS values to
> > include/exclude namespaces based on this new property. The only thing
> > missing is hooking up the actual Namespace Attachment command opcode,
> > which will allow a user to toggle the "attached" flag per namespace.
> >
> 
> Without Namespace Attachment support, the sole purpose of this
> parameter
> is to allow unusable namespace IDs to be reported. I have no problems
> with adding support for the additional CNS values. They will return
> identical responses, but I think that is good enough for now.
> 
> When it is not really needed, we should be wary of adding a parameter
> that is really hard to get rid of again.
> 
> > The reason for not hooking up this command completely is because the
> > NVMe specification requires the namespace management command to be
> > supported if the namespace attachment command is supported.
> >
> 
> There are many ways to support Namespace Management, and there are a
> lot
> of quirks with each of them. Do we use a big blockdev and carve out
> namespaces? Then, what are the semantics of an image resize operation?
> 
> Do we dynamically create blockdev devices - thats sounds pretty nice,
> but might have other quirks and the attachment is not really persistent.
> 
> I think at least the "attached" parameter should be x-prefixed, but
> better, leave it out for now until we know how we want Namespace
> Attachment and Management to be implemented.

I don't mind leaving this property out. I used it for testing the patch and it
could, in theory, be manipulated by an external process doing NS
Management, but, as you said, there is no certainty about now NS
Management will be implemented and any related CLI interface should
better be added as a part of this future work, not now.




RE: [PATCH v7 10/11] hw/block/nvme: Separate read and write handlers

2020-10-20 Thread Dmitry Fomichev
> -Original Message-
> From: Keith Busch 
> Sent: Tuesday, October 20, 2020 8:36 AM
> To: Klaus Jensen 
> Cc: Dmitry Fomichev ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Maxim Levitsky
> ; Fam Zheng ; Niklas Cassel
> ; Damien Le Moal ;
> qemu-bl...@nongnu.org; qemu-devel@nongnu.org; Alistair Francis
> ; Matias Bjorling 
> Subject: Re: [PATCH v7 10/11] hw/block/nvme: Separate read and write
> handlers
> 
> On Tue, Oct 20, 2020 at 10:28:22AM +0200, Klaus Jensen wrote:
> > On Oct 19 11:17, Dmitry Fomichev wrote:
> > > With ZNS support in place, the majority of code in nvme_rw() has
> > > become read- or write-specific. Move these parts to two separate
> > > handlers, nvme_read() and nvme_write() to make the code more
> > > readable and to remove multiple is_write checks that so far existed
> > > in the i/o path.
> > >
> > > This is a refactoring patch, no change in functionality.
> > >
> >
> > This makes a lot of sense, totally Acked, but it might be better to move
> > it ahead as a preparation patch? It would make the zoned patch easier on
> > the eye.
> 
> I agree with the suggestion.

Ok, will move them to the front of the series.



RE: [PATCH v7 01/11] hw/block/nvme: Add Commands Supported and Effects log

2020-10-20 Thread Dmitry Fomichev
> -Original Message-
> From: Klaus Jensen 
> Sent: Monday, October 19, 2020 4:16 PM
> To: Dmitry Fomichev 
> Cc: Keith Busch ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Maxim Levitsky
> ; Fam Zheng ; Niklas Cassel
> ; Damien Le Moal ;
> qemu-bl...@nongnu.org; qemu-devel@nongnu.org; Alistair Francis
> ; Matias Bjorling 
> Subject: Re: [PATCH v7 01/11] hw/block/nvme: Add Commands Supported
> and Effects log
> 
> On Oct 19 11:17, Dmitry Fomichev wrote:
> > This log page becomes necessary to implement to allow checking for
> > Zone Append command support in Zoned Namespace Command Set.
> >
> > This commit adds the code to report this log page for NVM Command
> > Set only. The parts that are specific to zoned operation will be
> > added later in the series.
> >
> > All incoming admin and i/o commands are now only processed if their
> > corresponding support bits are set in this log. This provides an
> > easy way to control what commands to support and what not to
> > depending on set CC.CSS.
> >
> > Signed-off-by: Dmitry Fomichev 
> > ---
> >  hw/block/nvme-ns.h|  1 +
> >  hw/block/nvme.c   | 98 +++--
> --
> >  hw/block/trace-events |  2 +
> >  include/block/nvme.h  | 19 +
> >  4 files changed, 111 insertions(+), 9 deletions(-)
> >
> > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > index 83734f4606..ea8c2f785d 100644
> > --- a/hw/block/nvme-ns.h
> > +++ b/hw/block/nvme-ns.h
> > @@ -29,6 +29,7 @@ typedef struct NvmeNamespace {
> >  int32_t  bootindex;
> >  int64_t  size;
> >  NvmeIdNs id_ns;
> > +const uint32_t *iocs;
> >
> >  NvmeNamespaceParams params;
> >  } NvmeNamespace;
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 9d30ca69dc..5a9493d89f 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -111,6 +111,28 @@ static const uint32_t
> nvme_feature_cap[NVME_FID_MAX] = {
> >  [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
> >  };
> >
> > +static const uint32_t nvme_cse_acs[256] = {
> > +[NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP,
> > +[NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP,
> > +[NVME_ADM_CMD_DELETE_CQ]= NVME_CMD_EFF_CSUPP,
> > +[NVME_ADM_CMD_CREATE_CQ]= NVME_CMD_EFF_CSUPP,
> > +[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
> > +[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
> > +[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
> > +[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
> > +[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
> > +};
> 
> NVME_ADM_CMD_ABORT is missing. And since you added a (redundant)
> check
> in nvme_admin_cmd that cheks this table, Abort is now an invalid
> command.

Adding the ABORT, thanks. I think this code was conceived before abort was
merged, this is why it is missing.

> 
> Also, can you reorder it according to opcode instead of
> pseudo-lexicographically?

Ok, will move ...GET_LOG_PAGE  that is now out or order.

> 
> > +
> > +static const uint32_t nvme_cse_iocs_none[256] = {
> > +};
> 
> [-pedantic] no need for the '= {}'

OK.

> 
> > +
> > +static const uint32_t nvme_cse_iocs_nvm[256] = {
> > +[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP |
> NVME_CMD_EFF_LBCC,
> > +[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP |
> NVME_CMD_EFF_LBCC,
> > +[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP |
> NVME_CMD_EFF_LBCC,
> > +[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
> > +};
> > +
> >  static void nvme_process_sq(void *opaque);
> >
> >  static uint16_t nvme_cid(NvmeRequest *req)
> > @@ -1032,10 +1054,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n,
> NvmeRequest *req)
> >  trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
> >req->cmd.opcode, 
> > nvme_io_opc_str(req->cmd.opcode));
> >
> > -if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) {
> > -return NVME_INVALID_OPCODE | NVME_DNR;
> > -}
> > -
> 
> I would assume the device to respond with invalid opcode before
> validating the nsid if it is an admin only device.

The host can't make any assumptions about the ordering of validation
checks performed by the controller. In the case of receiving an i/o
command with invalid NSID when CC.CSS == ADMIN_ONLY, both
Invalid Opcode and Invalid NSID error status codes should be

RE: [PATCH v7 08/11] hw/block/nvme: Add injection of Offline/Read-Only zones

2020-10-20 Thread Dmitry Fomichev
> -Original Message-
> From: Klaus Jensen 
> Sent: Monday, October 19, 2020 7:43 AM
> To: Dmitry Fomichev 
> Cc: Keith Busch ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Maxim Levitsky
> ; Fam Zheng ; Niklas Cassel
> ; Damien Le Moal ;
> qemu-bl...@nongnu.org; qemu-devel@nongnu.org; Alistair Francis
> ; Matias Bjorling 
> Subject: Re: [PATCH v7 08/11] hw/block/nvme: Add injection of Offline/Read-
> Only zones
> 
> On Oct 19 11:17, Dmitry Fomichev wrote:
> > ZNS specification defines two zone conditions for the zones that no
> > longer can function properly, possibly because of flash wear or other
> > internal fault. It is useful to be able to "inject" a small number of
> > such zones for testing purposes.
> >
> > This commit defines two optional device properties, "offline_zones"
> > and "rdonly_zones". Users can assign non-zero values to these variables
> > to specify the number of zones to be initialized as Offline or
> > Read-Only. The actual number of injected zones may be smaller than the
> > requested amount - Read-Only and Offline counts are expected to be much
> > smaller than the total number of zones on a drive.
> >
> > Signed-off-by: Dmitry Fomichev 
> > ---
> >  hw/block/nvme-ns.c | 64
> ++
> >  hw/block/nvme-ns.h |  2 ++
> >  2 files changed, 66 insertions(+)
> >
> > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > index 255ded2b43..d050f97909 100644
> > --- a/hw/block/nvme-ns.c
> > +++ b/hw/block/nvme-ns.c
> > @@ -21,6 +21,7 @@
> >  #include "sysemu/sysemu.h"
> >  #include "sysemu/block-backend.h"
> >  #include "qapi/error.h"
> > +#include "crypto/random.h"
> >
> >  #include "hw/qdev-properties.h"
> >  #include "hw/qdev-core.h"
> > @@ -132,6 +133,32 @@ static int
> nvme_calc_zone_geometry(NvmeNamespace *ns, Error **errp)
> >  return -1;
> >  }
> >
> > +if (ns->params.zd_extension_size) {
> > +if (ns->params.zd_extension_size & 0x3f) {
> > +error_setg(errp,
> > +"zone descriptor extension size must be a multiple of 
> > 64B");
> > +return -1;
> > +}
> > +if ((ns->params.zd_extension_size >> 6) > 0xff) {
> > +error_setg(errp, "zone descriptor extension size is too 
> > large");
> > +return -1;
> > +}
> > +}
> 
> Looks like this should have been added in the previous patch.

Right, this belongs to ZDE patch. 


[PATCH v7 10/11] hw/block/nvme: Separate read and write handlers

2020-10-18 Thread Dmitry Fomichev
With ZNS support in place, the majority of code in nvme_rw() has
become read- or write-specific. Move these parts to two separate
handlers, nvme_read() and nvme_write() to make the code more
readable and to remove multiple is_write checks that so far existed
in the i/o path.

This is a refactoring patch, no change in functionality.

Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme.c   | 191 +-
 hw/block/trace-events |   3 +-
 2 files changed, 114 insertions(+), 80 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3b9ea326d7..5ec4ce5e28 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1162,10 +1162,10 @@ typedef struct NvmeReadFillCtx {
 uint32_t  post_rd_fill_nlb;
 } NvmeReadFillCtx;
 
-static uint16_t nvme_check_zone_read(NvmeNamespace *ns, NvmeZone *zone,
- uint64_t slba, uint32_t nlb,
- NvmeReadFillCtx *rfc)
+static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
+ uint32_t nlb, NvmeReadFillCtx *rfc)
 {
+NvmeZone *zone = nvme_get_zone_by_slba(ns, slba);
 NvmeZone *next_zone;
 uint64_t bndry = nvme_zone_rd_boundary(ns, zone);
 uint64_t end = slba + nlb, wp1, wp2;
@@ -1449,6 +1449,86 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
+static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
+NvmeNamespace *ns = req->ns;
+uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+uint32_t fill_len;
+uint64_t data_size = nvme_l2b(ns, nlb);
+uint64_t data_offset, fill_ofs;
+NvmeReadFillCtx rfc;
+BlockBackend *blk = ns->blkconf.blk;
+uint16_t status;
+
+trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba);
+
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
+
+status = nvme_check_bounds(n, ns, slba, nlb);
+if (status) {
+trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+goto invalid;
+}
+
+if (ns->params.zoned) {
+status = nvme_check_zone_read(ns, slba, nlb, );
+if (status != NVME_SUCCESS) {
+trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
+goto invalid;
+}
+}
+
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+if (ns->params.zoned) {
+if (rfc.pre_rd_fill_nlb) {
+fill_ofs = nvme_l2b(ns, rfc.pre_rd_fill_slba - slba);
+fill_len = nvme_l2b(ns, rfc.pre_rd_fill_nlb);
+nvme_fill_read_data(req, fill_ofs, fill_len,
+n->params.fill_pattern);
+}
+if (!rfc.read_nlb) {
+/* No backend I/O necessary, only needed to fill the buffer */
+req->status = NVME_SUCCESS;
+return NVME_SUCCESS;
+}
+if (rfc.post_rd_fill_nlb) {
+req->fill_ofs = nvme_l2b(ns, rfc.post_rd_fill_slba - slba);
+req->fill_len = nvme_l2b(ns, rfc.post_rd_fill_nlb);
+} else {
+req->fill_len = 0;
+}
+slba = rfc.read_slba;
+data_size = nvme_l2b(ns, rfc.read_nlb);
+}
+
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_READ);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_read(blk, >qsg, data_offset,
+  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_preadv(blk, data_offset, >iov, 0,
+nvme_rw_cb, req);
+}
+return NVME_NO_COMPLETE;
+
+invalid:
+block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
+return status | NVME_DNR;
+}
+
 static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
@@ -1495,25 +1575,20 @@ invalid:
 return status | NVME_DNR;
 }
 
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req, bool append)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req, bool append)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
 NvmeNamespace *ns = req->ns;
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t data_size = nvme_l2b(ns, nlb);
-uint64_t data_offset, fill_ofs;
-
+uint64_t data_offset;
 NvmeZone *zone;
-uint32_t fill_len;
-NvmeReadFillCtx rfc;
-bool is_write = rw->opcode == NVME_CMD_WRITE || append;
-enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
BlockBackend *blk = ns->blkconf.blk;

[PATCH v7 11/11] hw/block/nvme: Merge nvme_write_zeroes() with nvme_write()

2020-10-18 Thread Dmitry Fomichev
nvme_write() now handles WRITE, WRITE ZEROES and ZONE_APPEND.

Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme.c   | 95 +--
 hw/block/trace-events |  1 -
 2 files changed, 28 insertions(+), 68 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 5ec4ce5e28..aa929d1edf 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1529,53 +1529,7 @@ invalid:
 return status | NVME_DNR;
 }
 
-static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
-{
-NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
-NvmeNamespace *ns = req->ns;
-uint64_t slba = le64_to_cpu(rw->slba);
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
-NvmeZone *zone;
-uint64_t offset = nvme_l2b(ns, slba);
-uint32_t count = nvme_l2b(ns, nlb);
-BlockBackend *blk = ns->blkconf.blk;
-uint16_t status;
-
-trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
-
-status = nvme_check_bounds(n, ns, slba, nlb);
-if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
-return status;
-}
-
-if (ns->params.zoned) {
-zone = nvme_get_zone_by_slba(ns, slba);
-
-status = nvme_check_zone_write(n, ns, zone, slba, nlb, false);
-if (status != NVME_SUCCESS) {
-goto invalid;
-}
-
-status = nvme_auto_open_zone(ns, zone);
-if (status != NVME_SUCCESS) {
-goto invalid;
-}
-
-req->cqe.result64 = nvme_advance_zone_wp(ns, zone, nlb);
-}
-
-block_acct_start(blk_get_stats(blk), >acct, 0, BLOCK_ACCT_WRITE);
-req->aiocb = blk_aio_pwrite_zeroes(blk, offset, count,
-   BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
-return NVME_NO_COMPLETE;
-
-invalid:
-block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
-return status | NVME_DNR;
-}
-
-static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req, bool append)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req, bool append, bool 
wrz)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
 NvmeNamespace *ns = req->ns;
@@ -1590,10 +1544,12 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req, bool append)
 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
  nvme_nsid(ns), nlb, data_size, slba);
 
-status = nvme_check_mdts(n, data_size);
-if (status) {
-trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
-goto invalid;
+if (!wrz) {
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
 }
 
 status = nvme_check_bounds(n, ns, slba, nlb);
@@ -1628,21 +1584,26 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req, bool append)
 
 data_offset = nvme_l2b(ns, slba);
 
-status = nvme_map_dptr(n, data_size, req);
-if (status) {
-goto invalid;
-}
+if (!wrz) {
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
 
-data_offset = nvme_l2b(ns, slba);
-
-block_acct_start(blk_get_stats(blk), >acct, data_size,
- BLOCK_ACCT_WRITE);
-if (req->qsg.sg) {
-req->aiocb = dma_blk_write(blk, >qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_WRITE);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_write(blk, >qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
+ nvme_rw_cb, req);
+}
 } else {
-req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
- nvme_rw_cb, req);
+block_acct_start(blk_get_stats(blk), >acct, 0, BLOCK_ACCT_WRITE);
+req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
+   BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
+   req);
 }
 return NVME_NO_COMPLETE;
 
@@ -2126,11 +2087,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 case NVME_CMD_FLUSH:
 return nvme_flush(n, req);
 case NVME_CMD_WRITE_ZEROES:
-return nvme_write_zeroes(n, req);
+return nvme_write(n, req, false, true);
 case NVME_CMD_ZONE_APPEND:
-return nvme_write(n, req, true);
+return nvme_write(n, req, true, false);
 case NVME_CMD_WRITE:
-return nvme_write(n, req, false);
+return nvme_write(n, req, false, false);
 case NVME_CMD_READ:
 return nvme_read(n, req);
 case NVME_CMD_ZONE_MGMT_

  1   2   3   4   >