On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote: > A zone append command is a write operation that specifies the first > logical block of a zone as the write position. When writing to a zoned > block device using zone append, the byte offset of writes is pointing > to the write pointer of that zone. Upon completion the device will > respond with the position the data has been written in the zone. > > Signed-off-by: Sam Li <faithilike...@gmail.com> > --- > block/block-backend.c | 65 ++++++++++++++++++++++ > block/file-posix.c | 89 +++++++++++++++++++++++++++++-- > block/io.c | 21 ++++++++ > block/raw-format.c | 8 +++ > include/block/block-io.h | 3 ++ > include/block/block_int-common.h | 5 ++ > include/block/raw-aio.h | 4 +- > include/sysemu/block-backend-io.h | 9 ++++ > 8 files changed, 198 insertions(+), 6 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index 1c618e9c68..06931ddd24 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > struct { > unsigned long op; > } zone_mgmt; > + struct { > + int64_t *append_sector; > + } zone_append; > }; > } BlkRwCo; > > @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return &acb->common; > } > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_append(rwco->blk, > rwco->zone_append.append_sector, > + rwco->iobuf, rwco->flags); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags, > + BlockCompletionFunc *cb, void *opaque) { > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .ret = NOT_DONE, > + .flags = flags, > + .iobuf = qiov, > + .zone_append = { > + .append_sector = offset, > + }, > + }; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > /* > * Send a zone_report command. > * offset is a byte offset from the start of the device. No alignment > @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return ret; > } > > +/* > + * Send a zone_append command. > + */ > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags) > +{ > + int ret; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + blk_wait_while_drained(blk); > + if (!blk_is_available(blk)) { > + blk_dec_in_flight(blk); > + return -ENOMEDIUM; > + } > + > + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > + blk_dec_in_flight(blk); > + return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 5ff5500301..3d0cc33d02 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData { > struct { > struct iovec *iov; > int niov; > + int64_t *offset; > } io; > struct { > uint64_t cmd; > @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, > Error **errp) > bs->bl.max_active_zones = ret; > } > > + ret = get_sysfs_long_val(&st, "physical_block_size"); > + if (ret >= 0) { > + bs->bl.write_granularity = ret; > + } > + > bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); > if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) { > error_report("report wps failed"); > @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int > nr_iov, off_t offset) > static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > { > ssize_t len; > + BlockZoneWps *wps = aiocb->bs->bl.wps; > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
Can this code ever be called for a non-zoned device with 0 zone size? If yes, you need to avoid division by zero here... > + > + if (wps) { > + qemu_mutex_lock(&wps->lock); > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > + aiocb->aio_offset = wps->wp[index]; > + } > + } > > do { > - if (aiocb->aio_type & QEMU_AIO_WRITE) > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) > len = qemu_pwritev(aiocb->aio_fildes, > aiocb->io.iov, > aiocb->io.niov, > @@ -1660,6 +1675,9 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData > *aiocb) > aiocb->io.niov, > aiocb->aio_offset); > } while (len == -1 && errno == EINTR); > + if (wps) { > + qemu_mutex_unlock(&wps->lock); > + } > > if (len == -1) { > return -errno; > @@ -1677,9 +1695,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData > *aiocb, char *buf) > { > ssize_t offset = 0; > ssize_t len; > + BlockZoneWps *wps = aiocb->bs->bl.wps; > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; same here > > + if (wps) { > + qemu_mutex_lock(&wps->lock); > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > + aiocb->aio_offset = wps->wp[index]; > + } > + } > while (offset < aiocb->aio_nbytes) { > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > len = pwrite(aiocb->aio_fildes, > (const char *)buf + offset, > aiocb->aio_nbytes - offset, > @@ -1709,6 +1735,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData > *aiocb, char *buf) > } > offset += len; > } > + if (wps) { > + qemu_mutex_unlock(&wps->lock); > + } > > return offset; > } > @@ -1772,7 +1801,7 @@ static int handle_aiocb_rw(void *opaque) > } > > nbytes = handle_aiocb_rw_linear(aiocb, buf); > - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { > + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { > char *p = buf; > size_t count = aiocb->aio_nbytes, copy; > int i; > @@ -1794,7 +1823,7 @@ static int handle_aiocb_rw(void *opaque) > out: > if (nbytes == aiocb->aio_nbytes) { > #if defined(CONFIG_BLKZONED) > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > BlockZoneWps *wps = aiocb->bs->bl.wps; > int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; and here > if (wps) { > @@ -1802,6 +1831,9 @@ out: > if (!BDRV_ZT_IS_CONV(wps->wp[index])) { > uint64_t wend_offset = > aiocb->aio_offset + aiocb->aio_nbytes; > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > + *aiocb->io.offset = wps->wp[index]; > + } > > /* Advance the wp if needed */ > if (wend_offset > wps->wp[index]) { > @@ -1824,7 +1856,7 @@ out: > } else { > assert(nbytes < 0); > #if defined(CONFIG_BLKZONED) > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1); > } > #endif > @@ -3478,6 +3510,52 @@ static int coroutine_fn > raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > } > #endif > > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > + int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) { > + BDRVRawState *s = bs->opaque; > + int64_t zone_size_mask = bs->bl.zone_size - 1; > + int64_t iov_len = 0; > + int64_t len = 0; > + RawPosixAIOData acb; > + > + if (*offset & zone_size_mask) { > + error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512); > + return -EINVAL; > + } > + > + int64_t wg = bs->bl.write_granularity; > + int64_t wg_mask = wg - 1; > + for (int i = 0; i < qiov->niov; i++) { > + iov_len = qiov->iov[i].iov_len; > + if (iov_len & wg_mask) { > + error_report("len of IOVector[%d] %" PRId64 " is not aligned to " > + "block size %" PRId64 "", i, iov_len, wg); > + return -EINVAL; > + } > + len += iov_len; > + } > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_ZONE_APPEND, > + .aio_offset = *offset, > + .aio_nbytes = len, > + .io = { > + .iov = qiov->iov, > + .niov = qiov->niov, > + .offset = offset, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); > +} > +#endif > + > static coroutine_fn int > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > bool blkdev) > @@ -4253,6 +4331,7 @@ static BlockDriver bdrv_zoned_host_device = { > /* zone management operations */ > .bdrv_co_zone_report = raw_co_zone_report, > .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > + .bdrv_co_zone_append = raw_co_zone_append, > }; > #endif > > diff --git a/block/io.c b/block/io.c > index 88f707ea4d..03e1109056 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -3230,6 +3230,27 @@ out: > return co.ret; > } > > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_append) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > void *qemu_blockalign(BlockDriverState *bs, size_t size) > { > IO_CODE(); > diff --git a/block/raw-format.c b/block/raw-format.c > index 18dc52a150..33bff8516e 100644 > --- a/block/raw-format.c > +++ b/block/raw-format.c > @@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState > *bs, BlockZoneOp op, > return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len); > } > > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > + int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) { > + return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags); > +} > + > static int64_t raw_getlength(BlockDriverState *bs) > { > int64_t len; > @@ -629,6 +636,7 @@ BlockDriver bdrv_raw = { > .bdrv_co_pdiscard = &raw_co_pdiscard, > .bdrv_co_zone_report = &raw_co_zone_report, > .bdrv_co_zone_mgmt = &raw_co_zone_mgmt, > + .bdrv_co_zone_append = &raw_co_zone_append, > .bdrv_co_block_status = &raw_co_block_status, > .bdrv_co_copy_range_from = &raw_co_copy_range_from, > .bdrv_co_copy_range_to = &raw_co_copy_range_to, > diff --git a/include/block/block-io.h b/include/block/block-io.h > index f0cdf67d33..6a54453578 100644 > --- a/include/block/block-io.h > +++ b/include/block/block-io.h > @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, > int64_t offset, > BlockZoneDescriptor *zones); > int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > int64_t offset, int64_t len); > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > diff --git a/include/block/block_int-common.h b/include/block/block_int- > common.h > index e3136146aa..a7e7db5646 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -701,6 +701,9 @@ struct BlockDriver { > BlockZoneDescriptor *zones); > int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp > op, > int64_t offset, int64_t len); > + int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs, > + int64_t *offset, QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > /* removable device specific */ > bool (*bdrv_is_inserted)(BlockDriverState *bs); > @@ -860,6 +863,8 @@ typedef struct BlockLimits { > > /* array of write pointers' location of each zone in the zoned device. */ > BlockZoneWps *wps; > + > + int64_t write_granularity; > } BlockLimits; > > typedef struct BdrvOpBlocker BdrvOpBlocker; > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > index 877b2240b3..53033a5ca7 100644 > --- a/include/block/raw-aio.h > +++ b/include/block/raw-aio.h > @@ -31,6 +31,7 @@ > #define QEMU_AIO_TRUNCATE 0x0080 > #define QEMU_AIO_ZONE_REPORT 0x0100 > #define QEMU_AIO_ZONE_MGMT 0x0200 > +#define QEMU_AIO_ZONE_APPEND 0x0400 > #define QEMU_AIO_TYPE_MASK \ > (QEMU_AIO_READ | \ > QEMU_AIO_WRITE | \ > @@ -41,7 +42,8 @@ > QEMU_AIO_COPY_RANGE | \ > QEMU_AIO_TRUNCATE | \ > QEMU_AIO_ZONE_REPORT | \ > - QEMU_AIO_ZONE_MGMT) > + QEMU_AIO_ZONE_MGMT | \ > + QEMU_AIO_ZONE_APPEND) > > /* AIO flags */ > #define QEMU_AIO_MISALIGNED 0x1000 > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend- > io.h > index 1b5fc7db6b..ff9f777f52 100644 > --- a/include/sysemu/block-backend-io.h > +++ b/include/sysemu/block-backend-io.h > @@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t > offset, > BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > int64_t offset, int64_t len, > BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags, > + BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t > bytes, > BlockCompletionFunc *cb, void *opaque); > void blk_aio_cancel_async(BlockAIOCB *acb); > @@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > int64_t offset, int64_t len); > int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > int64_t offset, int64_t len); > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > int64_t bytes);