Re: [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers

2022-10-16 Thread Damien Le Moal
On 10/16/22 23:56, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is conventional zones.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li 
> ---
>  block/file-posix.c   | 144 +++
>  include/block/block-common.h |  14 +++
>  include/block/block_int-common.h |   3 +
>  3 files changed, 161 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 7c5a330fc1..5ff5500301 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1324,6 +1324,66 @@ static int hdev_get_max_segments(int fd, struct stat 
> *st)
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +unsigned int nrz) {
> +struct blk_zone *blkz;
> +int64_t rep_size;
> +int64_t sector = offset >> BDRV_SECTOR_BITS;
> +int ret, n = 0, i = 0;
> +rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
> blk_zone);
> +g_autofree struct blk_zone_report *rep = NULL;
> +
> +rep = g_malloc(rep_size);
> +blkz = (struct blk_zone *)(rep + 1);
> +while (n < nrz) {
> +memset(rep, 0, rep_size);
> +rep->sector = sector;
> +rep->nr_zones = nrz - n;
> +
> +do {
> +ret = ioctl(fd, BLKREPORTZONE, rep);
> +} while (ret != 0 && errno == EINTR);
> +if (ret != 0) {
> +error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +fd, offset, errno);
> +return -errno;
> +}
> +
> +if (!rep->nr_zones) {
> +break;
> +}
> +
> +for (i = 0; i < rep->nr_zones; i++, n++) {
> +/*
> + * The wp tracking cares only about sequential writes required 
> and
> + * sequential write preferred zones so that the wp can advance to
> + * the right location.
> + * Use the most significant bit of the wp location to indicate 
> the
> + * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> + */
> +if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> +wps->wp[i] = 1ULL << 63;
> +} else {
> +wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;

Nit: For full, read-only and offline zones, the wp of a zone is undefined,
that is, its value may be total garbage and should not be used. The kernel
will normally report a wp set to zone start + zone len for these cases,
but better do the same here too. So this single line should be something
like this:

switch (blkz[i].cond) {
case BLK_ZONE_COND_FULL:
case BLK_ZONE_COND_READONLY:
/* Zone not writeable */
wps->wp[i] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
break;
case BLK_ZONE_COND_OFFLINE:
/* Zone not writable nor readable */
wps->wp[i] = blkz[i].start << BDRV_SECTOR_BITS;
break;
default:
wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
break;
}

> +}
> +}
> +sector = blkz[i - 1].start + blkz[i - 1].len;
> +}
> +
> +return 0;
> +}
> +
> +static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +unsigned int nrz) {
> +qemu_mutex_lock(>lock);
> +if (get_zones_wp(fd, wps, offset, nrz) < 0) {
> +error_report("update zone wp failed");
> +}
> +qemu_mutex_unlock(>lock);
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
> @@ -1414,6 +1474,14 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  if (ret >= 0) {
>  bs->bl.max_active_zones = ret;
>  }
> +
> +bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> +if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
> +error_report("report wps failed");
> +g_free(bs->bl.wps);
> +return;
> +}
> +qemu_mutex_init(>bl.wps->lock);
>  }
>  }
>  
> @@ -1725,6 +1793,25 @@ static int handle_aiocb_rw(void *opaque)
>  
>  out:
>  if (nbytes == aiocb->aio_nbytes) {
> +#if defined(CONFIG_BLKZONED)
> +if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +BlockZoneWps *wps = aiocb->bs->bl.wps;
> +int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +if (wps) {
> +

Re: [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is conventional zones.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li 
> ---
>  block/file-posix.c   | 144 +++
>  include/block/block-common.h |  14 +++
>  include/block/block_int-common.h |   3 +
>  3 files changed, 161 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 7c5a330fc1..5ff5500301 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1324,6 +1324,66 @@ static int hdev_get_max_segments(int fd, struct stat
> *st)
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +    unsigned int nrz) {
> +    struct blk_zone *blkz;
> +    int64_t rep_size;

size_t

> +    int64_t sector = offset >> BDRV_SECTOR_BITS;

uint64_t

> +    int ret, n = 0, i = 0;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
> blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +
> +    rep = g_malloc(rep_size);
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +    memset(rep, 0, rep_size);
> +    rep->sector = sector;
> +    rep->nr_zones = nrz - n;
> +
> +    do {
> +    ret = ioctl(fd, BLKREPORTZONE, rep);
> +    } while (ret != 0 && errno == EINTR);
> +    if (ret != 0) {
> +    error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +    fd, offset, errno);
> +    return -errno;
> +    }
> +
> +    if (!rep->nr_zones) {
> +    break;
> +    }
> +
> +    for (i = 0; i < rep->nr_zones; i++, n++) {
> +    /*
> + * The wp tracking cares only about sequential writes required 
> and
> + * sequential write preferred zones so that the wp can advance to
> + * the right location.
> + * Use the most significant bit of the wp location to indicate 
> the
> + * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> + */
> +    if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> +    wps->wp[i] = 1ULL << 63;
> +    } else {
> +    wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> +    }
> +    }
> +    sector = blkz[i - 1].start + blkz[i - 1].len;
> +    }
> +
> +    return 0;
> +}
> +
> +static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +    unsigned int nrz) {
> +    qemu_mutex_lock(>lock);
> +    if (get_zones_wp(fd, wps, offset, nrz) < 0) {
> +    error_report("update zone wp failed");
> +    }
> +    qemu_mutex_unlock(>lock);
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
> @@ -1414,6 +1474,14 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>  if (ret >= 0) {
>  bs->bl.max_active_zones = ret;
>  }
> +
> +    bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> +    if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
> +    error_report("report wps failed");
> +    g_free(bs->bl.wps);
> +    return;
> +    }
> +    qemu_mutex_init(>bl.wps->lock);
>  }
>  }
>  
> @@ -1725,6 +1793,25 @@ static int handle_aiocb_rw(void *opaque)
>  
>  out:
>  if (nbytes == aiocb->aio_nbytes) {
> +#if defined(CONFIG_BLKZONED)
> +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +    if (wps) {

In my testing, I get a divide by zero exception in the "index"
calculation above. Changing this part as follows

-int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
-if (wps) {
+if (wps && aiocb->bs->bl.zone_size) {
+int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+

fixes the crash.

> +    qemu_mutex_lock(>lock);
> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +    uint64_t wend_offset =
> +    aiocb->aio_offset + aiocb->aio_nbytes;
> +
> +    /* Advance the wp if needed */
> +    if (wend_offset > wps->wp[index]) {
> +  

[PATCH v4 1/3] file-posix: add the tracking of the zones write pointers

2022-10-16 Thread Sam Li
Since Linux doesn't have a user API to issue zone append operations to
zoned devices from user space, the file-posix driver is modified to add
zone append emulation using regular writes. To do this, the file-posix
driver tracks the wp location of all zones of the device. It uses an
array of uint64_t. The most significant bit of each wp location indicates
if the zone type is conventional zones.

The zones wp can be changed due to the following operations issued:
- zone reset: change the wp to the start offset of that zone
- zone finish: change to the end location of that zone
- write to a zone
- zone append

Signed-off-by: Sam Li 
---
 block/file-posix.c   | 144 +++
 include/block/block-common.h |  14 +++
 include/block/block_int-common.h |   3 +
 3 files changed, 161 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index 7c5a330fc1..5ff5500301 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1324,6 +1324,66 @@ static int hdev_get_max_segments(int fd, struct stat *st)
 #endif
 }
 
+#if defined(CONFIG_BLKZONED)
+static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
+unsigned int nrz) {
+struct blk_zone *blkz;
+int64_t rep_size;
+int64_t sector = offset >> BDRV_SECTOR_BITS;
+int ret, n = 0, i = 0;
+rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+g_autofree struct blk_zone_report *rep = NULL;
+
+rep = g_malloc(rep_size);
+blkz = (struct blk_zone *)(rep + 1);
+while (n < nrz) {
+memset(rep, 0, rep_size);
+rep->sector = sector;
+rep->nr_zones = nrz - n;
+
+do {
+ret = ioctl(fd, BLKREPORTZONE, rep);
+} while (ret != 0 && errno == EINTR);
+if (ret != 0) {
+error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+fd, offset, errno);
+return -errno;
+}
+
+if (!rep->nr_zones) {
+break;
+}
+
+for (i = 0; i < rep->nr_zones; i++, n++) {
+/*
+ * The wp tracking cares only about sequential writes required and
+ * sequential write preferred zones so that the wp can advance to
+ * the right location.
+ * Use the most significant bit of the wp location to indicate the
+ * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
+ */
+if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+wps->wp[i] = 1ULL << 63;
+} else {
+wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
+}
+}
+sector = blkz[i - 1].start + blkz[i - 1].len;
+}
+
+return 0;
+}
+
+static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
+unsigned int nrz) {
+qemu_mutex_lock(>lock);
+if (get_zones_wp(fd, wps, offset, nrz) < 0) {
+error_report("update zone wp failed");
+}
+qemu_mutex_unlock(>lock);
+}
+#endif
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
 BDRVRawState *s = bs->opaque;
@@ -1414,6 +1474,14 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 if (ret >= 0) {
 bs->bl.max_active_zones = ret;
 }
+
+bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
+if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
+error_report("report wps failed");
+g_free(bs->bl.wps);
+return;
+}
+qemu_mutex_init(>bl.wps->lock);
 }
 }
 
@@ -1725,6 +1793,25 @@ static int handle_aiocb_rw(void *opaque)
 
 out:
 if (nbytes == aiocb->aio_nbytes) {
+#if defined(CONFIG_BLKZONED)
+if (aiocb->aio_type & QEMU_AIO_WRITE) {
+BlockZoneWps *wps = aiocb->bs->bl.wps;
+int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+if (wps) {
+qemu_mutex_lock(>lock);
+if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+uint64_t wend_offset =
+aiocb->aio_offset + aiocb->aio_nbytes;
+
+/* Advance the wp if needed */
+if (wend_offset > wps->wp[index]) {
+wps->wp[index] = wend_offset;
+}
+}
+qemu_mutex_unlock(>lock);
+}
+}
+#endif
 return 0;
 } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
 if (aiocb->aio_type & QEMU_AIO_WRITE) {
@@ -1736,6 +1823,11 @@ out:
 }
 } else {
 assert(nbytes < 0);
+#if defined(CONFIG_BLKZONED)
+if (aiocb->aio_type & QEMU_AIO_WRITE) {
+update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1);
+}
+#endif
 return nbytes;
 }
 }
@@ -2022,14 +2114,29 @@ static int handle_aiocb_zone_report(void *opaque)
 #endif
 
 #if