Re: [PATCH v6 2/4] block: introduce zone append write for zoned devices

2023-03-16 Thread Stefan Hajnoczi
On Fri, Mar 10, 2023 at 06:31:04PM +0800, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c | 60 +++
>  block/file-posix.c| 54 +---
>  block/io.c| 21 +++
>  block/io_uring.c  |  4 +++
>  block/linux-aio.c |  3 ++
>  block/raw-format.c|  8 +
>  include/block/block-io.h  |  4 +++
>  include/block/block_int-common.h  |  5 +++
>  include/block/raw-aio.h   |  4 ++-
>  include/sysemu/block-backend-io.h |  9 +
>  10 files changed, 166 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index f70b08e3f6..28e8f5d778 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1888,6 +1888,45 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_append(rwco->blk, &acb->bytes,
> +   rwco->iobuf, rwco->flags);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags,
> +BlockCompletionFunc *cb, void *opaque) {
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.ret= NOT_DONE,
> +.flags  = flags,
> +.iobuf  = qiov,
> +};
> +acb->bytes = *offset;
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +aio_co_enter(blk_get_aio_context(blk), co);
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}

How is the resulting offset value communicated back to the caller? I
see offset being read (dereferenced) but there is no write (assignment).
Maybe this function should pass through acb->bytes = (int64_t)offset
instead so that blk_co_zone_append() can modify the offset?


signature.asc
Description: PGP signature


Re: [PATCH v6 2/4] block: introduce zone append write for zoned devices

2023-03-13 Thread Dmitry Fomichev
On Fri, 2023-03-10 at 18:31 +0800, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone.

s/writes is pointing to the write pointer of that zone/the call may point at any
position within the zone to which the data is being appended/

>  Upon completion the device will
> respond with the position the data

s/position the data/position where the data/

>  has been written in the zone.
> 
> Signed-off-by: Sam Li 

With nits above,

Reviewed-by: Dmitry Fomichev 

> ---
>  block/block-backend.c | 60 +++
>  block/file-posix.c    | 54 +---
>  block/io.c    | 21 +++
>  block/io_uring.c  |  4 +++
>  block/linux-aio.c |  3 ++
>  block/raw-format.c    |  8 +
>  include/block/block-io.h  |  4 +++
>  include/block/block_int-common.h  |  5 +++
>  include/block/raw-aio.h   |  4 ++-
>  include/sysemu/block-backend-io.h |  9 +
>  10 files changed, 166 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index f70b08e3f6..28e8f5d778 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1888,6 +1888,45 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, &acb->bytes,
> +   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +    QEMUIOVector *qiov, BdrvRequestFlags flags,
> +    BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .ret    = NOT_DONE,
> +    .flags  = flags,
> +    .iobuf  = qiov,
> +    };
> +    acb->bytes = *offset;
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    aio_co_enter(blk_get_aio_context(blk), co);
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1939,6 +1978,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +    QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +    blk_dec_in_flight(blk);
> +    return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 61ed769ac8..2ba9174778 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -160,6 +160,7 @@ typedef struct BDRVRawState {
>  bool has_write_zeroes:1;
>  bool use_linux_aio:1;
>  bool use_linux_io_uring:1;
> +    int64_t *offset; /* offset of zone append operation */
>  int page_cache_inconsistent; /* errno from fdatasync failure */
>  bool has_fallocate;
>  bool needs_alignment;
> @@ -1672,7 +1673,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData
> *aiocb)
>  ssize_t len;
>  
>  len = RETRY_ON_EINTR(
> -    (aiocb->aio_type & QEMU_AIO_WRITE) ?
> +    (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
>  qemu_pwritev(aiocb->aio_fildes,
>     aiocb->io.iov,
>     aiocb->io.niov,
> @@ -1701,7 +1702,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData
> *aiocb, char *buf)
>  ssize_t len;
>  
>  while (offset < aiocb->aio_nbytes) {
> -    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +    if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>  len = pwrite(aiocb->aio_fildes,
>   (const char *)buf + offset,
>   

[PATCH v6 2/4] block: introduce zone append write for zoned devices

2023-03-10 Thread Sam Li
A zone append command is a write operation that specifies the first
logical block of a zone as the write position. When writing to a zoned
block device using zone append, the byte offset of writes is pointing
to the write pointer of that zone. Upon completion the device will
respond with the position the data has been written in the zone.

Signed-off-by: Sam Li 
---
 block/block-backend.c | 60 +++
 block/file-posix.c| 54 +---
 block/io.c| 21 +++
 block/io_uring.c  |  4 +++
 block/linux-aio.c |  3 ++
 block/raw-format.c|  8 +
 include/block/block-io.h  |  4 +++
 include/block/block_int-common.h  |  5 +++
 include/block/raw-aio.h   |  4 ++-
 include/sysemu/block-backend-io.h |  9 +
 10 files changed, 166 insertions(+), 6 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index f70b08e3f6..28e8f5d778 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1888,6 +1888,45 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
BlockZoneOp op,
 return &acb->common;
 }
 
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
+{
+BlkAioEmAIOCB *acb = opaque;
+BlkRwCo *rwco = &acb->rwco;
+
+rwco->ret = blk_co_zone_append(rwco->blk, &acb->bytes,
+   rwco->iobuf, rwco->flags);
+blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+QEMUIOVector *qiov, BdrvRequestFlags flags,
+BlockCompletionFunc *cb, void *opaque) {
+BlkAioEmAIOCB *acb;
+Coroutine *co;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+acb->rwco = (BlkRwCo) {
+.blk= blk,
+.ret= NOT_DONE,
+.flags  = flags,
+.iobuf  = qiov,
+};
+acb->bytes = *offset;
+acb->has_returned = false;
+
+co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+aio_co_enter(blk_get_aio_context(blk), co);
+acb->has_returned = true;
+if (acb->rwco.ret != NOT_DONE) {
+replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+}
+
+return &acb->common;
+}
+
 /*
  * Send a zone_report command.
  * offset is a byte offset from the start of the device. No alignment
@@ -1939,6 +1978,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
BlockZoneOp op,
 return ret;
 }
 
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+int ret;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+blk_wait_while_drained(blk);
+if (!blk_is_available(blk)) {
+blk_dec_in_flight(blk);
+return -ENOMEDIUM;
+}
+
+ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+blk_dec_in_flight(blk);
+return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
 BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 61ed769ac8..2ba9174778 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -160,6 +160,7 @@ typedef struct BDRVRawState {
 bool has_write_zeroes:1;
 bool use_linux_aio:1;
 bool use_linux_io_uring:1;
+int64_t *offset; /* offset of zone append operation */
 int page_cache_inconsistent; /* errno from fdatasync failure */
 bool has_fallocate;
 bool needs_alignment;
@@ -1672,7 +1673,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
*aiocb)
 ssize_t len;
 
 len = RETRY_ON_EINTR(
-(aiocb->aio_type & QEMU_AIO_WRITE) ?
+(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
 qemu_pwritev(aiocb->aio_fildes,
aiocb->io.iov,
aiocb->io.niov,
@@ -1701,7 +1702,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData 
*aiocb, char *buf)
 ssize_t len;
 
 while (offset < aiocb->aio_nbytes) {
-if (aiocb->aio_type & QEMU_AIO_WRITE) {
+if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
 len = pwrite(aiocb->aio_fildes,
  (const char *)buf + offset,
  aiocb->aio_nbytes - offset,
@@ -1794,7 +1795,7 @@ static int handle_aiocb_rw(void *opaque)
 }
 
 nbytes = handle_aiocb_rw_linear(aiocb, buf);
-if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
 char *p = buf;
 size_t count = aiocb->aio_nbytes, copy;
 int i;
@@ -2431,6 +2432,10 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, 
uint64_t offset,
 #if defined(CONFIG_BLKZONED)
 if (bs->bl.wps) {
 qemu_co_mutex_lock(&bs->bl.wps->colock);