Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices

2022-10-16 Thread Damien Le Moal
On 10/16/22 23:56, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c | 65 ++
>  block/file-posix.c| 89 +--
>  block/io.c| 21 
>  block/raw-format.c|  8 +++
>  include/block/block-io.h  |  3 ++
>  include/block/block_int-common.h  |  5 ++
>  include/block/raw-aio.h   |  4 +-
>  include/sysemu/block-backend-io.h |  9 
>  8 files changed, 198 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 1c618e9c68..06931ddd24 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>  struct {
>  unsigned long op;
>  } zone_mgmt;
> +struct {
> +int64_t *append_sector;

As mentioned previously, call this sector. "append" is already in the
zone_append struct member name.

> +} zone_append;
>  };
>  } BlkRwCo;
>  
> @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_append(rwco->blk, 
> rwco->zone_append.append_sector,

...so you avoid awkward repetitions of "append" like here. You'll have:
rwco->zone_append.sector, which is shorter and more natural.

> +   rwco->iobuf, rwco->flags);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags,
> +BlockCompletionFunc *cb, void *opaque) {
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.ret= NOT_DONE,
> +.flags  = flags,
> +.iobuf  = qiov,
> +.zone_append = {
> +.append_sector = offset,
> +},
> +};
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +bdrv_coroutine_enter(blk_bs(blk), co);
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +int ret;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +blk_wait_while_drained(blk);
> +if (!blk_is_available(blk)) {
> +blk_dec_in_flight(blk);
> +return -ENOMEDIUM;
> +}
> +
> +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 5ff5500301..3d0cc33d02 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
>  struct {
>  struct iovec *iov;
>  int niov;
> +int64_t *offset;
>  } io;
>  struct {
>  uint64_t cmd;
> @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  bs->bl.max_active_zones = ret;
>  }
>  
> +ret = get_sysfs_long_val(&st, "physical_block_size");
> +if (ret >= 0) {
> +bs->bl.write_granularity = ret;
> +}
> +
>  bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>  if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
>  error_report("report wps failed");
> @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int 
> nr_iov, off_t offset)
>  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>  {
>  ssize_t len;
> +

Re: [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers

2022-10-16 Thread Damien Le Moal
On 10/16/22 23:56, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is conventional zones.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li 
> ---
>  block/file-posix.c   | 144 +++
>  include/block/block-common.h |  14 +++
>  include/block/block_int-common.h |   3 +
>  3 files changed, 161 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 7c5a330fc1..5ff5500301 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1324,6 +1324,66 @@ static int hdev_get_max_segments(int fd, struct stat 
> *st)
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +unsigned int nrz) {
> +struct blk_zone *blkz;
> +int64_t rep_size;
> +int64_t sector = offset >> BDRV_SECTOR_BITS;
> +int ret, n = 0, i = 0;
> +rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
> blk_zone);
> +g_autofree struct blk_zone_report *rep = NULL;
> +
> +rep = g_malloc(rep_size);
> +blkz = (struct blk_zone *)(rep + 1);
> +while (n < nrz) {
> +memset(rep, 0, rep_size);
> +rep->sector = sector;
> +rep->nr_zones = nrz - n;
> +
> +do {
> +ret = ioctl(fd, BLKREPORTZONE, rep);
> +} while (ret != 0 && errno == EINTR);
> +if (ret != 0) {
> +error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +fd, offset, errno);
> +return -errno;
> +}
> +
> +if (!rep->nr_zones) {
> +break;
> +}
> +
> +for (i = 0; i < rep->nr_zones; i++, n++) {
> +/*
> + * The wp tracking cares only about sequential writes required 
> and
> + * sequential write preferred zones so that the wp can advance to
> + * the right location.
> + * Use the most significant bit of the wp location to indicate 
> the
> + * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> + */
> +if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> +wps->wp[i] = 1ULL << 63;
> +} else {
> +wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;

Nit: For full, read-only and offline zones, the wp of a zone is undefined,
that is, its value may be total garbage and should not be used. The kernel
will normally report a wp set to zone start + zone len for these cases,
but it is better to do the same here too. So this single line should be
something like this:

switch (blkz[i].cond) {
case BLK_ZONE_COND_FULL:
case BLK_ZONE_COND_READONLY:
/* Zone not writeable */
wps->wp[i] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
break;
case BLK_ZONE_COND_OFFLINE:
/* Zone not writable nor readable */
wps->wp[i] = blkz[i].start << BDRV_SECTOR_BITS;
break;
default:
wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
break;
}

> +}
> +}
> +sector = blkz[i - 1].start + blkz[i - 1].len;
> +}
> +
> +return 0;
> +}
> +
> +static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +unsigned int nrz) {
> +qemu_mutex_lock(&wps->lock);
> +if (get_zones_wp(fd, wps, offset, nrz) < 0) {
> +error_report("update zone wp failed");
> +}
> +qemu_mutex_unlock(&wps->lock);
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
> @@ -1414,6 +1474,14 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  if (ret >= 0) {
>  bs->bl.max_active_zones = ret;
>  }
> +
> +bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> +if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
> +error_report("report wps failed");
> +g_free(bs->bl.wps);
> +return;
> +}
> +qemu_mutex_init(&bs->bl.wps->lock);
>  }
>  }
>  
> @@ -1725,6 +1793,25 @@ static int handle_aiocb_rw(void *opaque)
>  
>  out:
>  if (nbytes == aiocb->aio_nbytes) {
> +#if defined(CONFIG_BLKZONED)
> +if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +BlockZoneWps *wps = aiocb->bs->bl.wps;
> +int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +if (wps) {
> +

Re: [PATCH v12 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-10-16 Thread Damien Le Moal
On 10/16/22 23:51, Sam Li wrote:
> Add a new zoned_host_device BlockDriver. The zoned_host_device option
> accepts only zoned host block devices. By adding zone management
> operations in this new BlockDriver, users can use the new block
> layer APIs including Report Zone and four zone management operations
> (open, close, finish, reset, reset_all).
> 
> Qemu-io uses the new APIs to perform zoned storage commands of the device:
> zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
> zone_finish(zf).
> 
> For example, to test zone_report, use following command:
> $ ./build/qemu-io --image-opts -n driver=zoned_host_device, 
> filename=/dev/nullb0
> -c "zrp offset nr_zones"
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Hannes Reinecke 
> ---
>  block/block-backend.c | 148 +
>  block/file-posix.c| 335 ++
>  block/io.c|  41 
>  include/block/block-io.h  |   7 +
>  include/block/block_int-common.h  |  24 +++
>  include/block/raw-aio.h   |   6 +-
>  include/sysemu/block-backend-io.h |  18 ++
>  meson.build   |   4 +
>  qapi/block-core.json  |   8 +-
>  qemu-io-cmds.c| 149 +
>  10 files changed, 737 insertions(+), 3 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index aa4adf06ae..1c618e9c68 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
>  void *iobuf;
>  int ret;
>  BdrvRequestFlags flags;
> +union {
> +struct {
> +unsigned int *nr_zones;
> +BlockZoneDescriptor *zones;
> +} zone_report;
> +struct {
> +unsigned long op;
> +} zone_mgmt;
> +};
>  } BlkRwCo;
>  
>  int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
> @@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>  return ret;
>  }
>  
> +static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
> +{
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
> +   rwco->zone_report.nr_zones,
> +   rwco->zone_report.zones);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> +unsigned int *nr_zones,
> +BlockZoneDescriptor  *zones,
> +BlockCompletionFunc *cb, void *opaque)
> +{
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.offset = offset,
> +.ret= NOT_DONE,
> +.zone_report = {
> +.zones = zones,
> +.nr_zones = nr_zones,
> +},
> +};
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
> +bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}
> +
> +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
> +{
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
> + rwco->offset, acb->bytes);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +  int64_t offset, int64_t len,
> +  BlockCompletionFunc *cb, void *opaque) {
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.offset = offset,
> +.ret= NOT_DONE,
> +.zone_mgmt = {
> +.op = op,
> +},
> +};
> +acb->bytes = len;
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
> +bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}
> +
> +/*
> + * Send a zone_report command.
> + * offset is a byte offset from the start of the device. No alignment
> + * required for offset.
> + * nr_zones represents IN maximum and OUT actual.
> + */
> +int 

Re: [RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+

2022-10-16 Thread Damien Le Moal
On 10/17/22 09:53, Dmitry Fomichev wrote:
> On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
>> Use scripts/update-linux-headers.sh to update virtio-blk headers
>> from Dmitry's "virtio-blk:add support for zoned block devices"
>> linux patch. There is a link for more information:
>> https://github.com/dmitry-fomichev/virtblk-zbd
>>
>> Signed-off-by: Sam Li 
>> Reviewed-by: Stefan Hajnoczi 
>> Signed-off-by: Sam Li 
> 
> the duplicate sign-off is not needed. With this,
> 
> Reviewed-by: Dmitry Fomichev 

The mention of the Linux kernel version should be removed from the patch
title as the changes are not included in any upstream kernel yet.

> 
>> ---
>>  include/standard-headers/linux/virtio_blk.h | 109 
>>  1 file changed, 109 insertions(+)
>>
>> diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-
>> headers/linux/virtio_blk.h
>> index 2dcc90826a..490bd21c76 100644
>> --- a/include/standard-headers/linux/virtio_blk.h
>> +++ b/include/standard-headers/linux/virtio_blk.h
>> @@ -40,6 +40,7 @@
>>  #define VIRTIO_BLK_F_MQ12  /* support more than one vq 
>> */
>>  #define VIRTIO_BLK_F_DISCARD   13  /* DISCARD is supported */
>>  #define VIRTIO_BLK_F_WRITE_ZEROES  14  /* WRITE ZEROES is supported 
>> */
>> +#define VIRTIO_BLK_F_ZONED 17  /* Zoned block device */
>>  
>>  /* Legacy feature bits */
>>  #ifndef VIRTIO_BLK_NO_LEGACY
>> @@ -119,6 +120,20 @@ struct virtio_blk_config {
>> uint8_t write_zeroes_may_unmap;
>>  
>> uint8_t unused1[3];
>> +
>> +   /* Secure erase fields that are defined in the virtio spec */
>> +   uint8_t sec_erase[12];
>> +
>> +   /* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
>> +   struct virtio_blk_zoned_characteristics {
>> +   __virtio32 zone_sectors;
>> +   __virtio32 max_open_zones;
>> +   __virtio32 max_active_zones;
>> +   __virtio32 max_append_sectors;
>> +   __virtio32 write_granularity;
>> +   uint8_t model;
>> +   uint8_t unused2[3];
>> +   } zoned;
>>  } QEMU_PACKED;
>>  
>>  /*
>> @@ -153,6 +168,27 @@ struct virtio_blk_config {
>>  /* Write zeroes command */
>>  #define VIRTIO_BLK_T_WRITE_ZEROES  13
>>  
>> +/* Zone append command */
>> +#define VIRTIO_BLK_T_ZONE_APPEND    15
>> +
>> +/* Report zones command */
>> +#define VIRTIO_BLK_T_ZONE_REPORT    16
>> +
>> +/* Open zone command */
>> +#define VIRTIO_BLK_T_ZONE_OPEN  18
>> +
>> +/* Close zone command */
>> +#define VIRTIO_BLK_T_ZONE_CLOSE 20
>> +
>> +/* Finish zone command */
>> +#define VIRTIO_BLK_T_ZONE_FINISH    22
>> +
>> +/* Reset zone command */
>> +#define VIRTIO_BLK_T_ZONE_RESET 24
>> +
>> +/* Reset All zones command */
>> +#define VIRTIO_BLK_T_ZONE_RESET_ALL 26
>> +
>>  #ifndef VIRTIO_BLK_NO_LEGACY
>>  /* Barrier before this op. */
>>  #define VIRTIO_BLK_T_BARRIER   0x8000
>> @@ -172,6 +208,72 @@ struct virtio_blk_outhdr {
>> __virtio64 sector;
>>  };
>>  
>> +/*
>> + * Supported zoned device models.
>> + */
>> +
>> +/* Regular block device */
>> +#define VIRTIO_BLK_Z_NONE  0
>> +/* Host-managed zoned device */
>> +#define VIRTIO_BLK_Z_HM    1
>> +/* Host-aware zoned device */
>> +#define VIRTIO_BLK_Z_HA    2
>> +
>> +/*
>> + * Zone descriptor. A part of VIRTIO_BLK_T_ZONE_REPORT command reply.
>> + */
>> +struct virtio_blk_zone_descriptor {
>> +   /* Zone capacity */
>> +   __virtio64 z_cap;
>> +   /* The starting sector of the zone */
>> +   __virtio64 z_start;
>> +   /* Zone write pointer position in sectors */
>> +   __virtio64 z_wp;
>> +   /* Zone type */
>> +   uint8_t z_type;
>> +   /* Zone state */
>> +   uint8_t z_state;
>> +   uint8_t reserved[38];
>> +};
>> +
>> +struct virtio_blk_zone_report {
>> +   __virtio64 nr_zones;
>> +   uint8_t reserved[56];
>> +   struct virtio_blk_zone_descriptor zones[];
>> +};
>> +
>> +/*
>> + * Supported zone types.
>> + */
>> +
>> +/* Conventional zone */
>> +#define VIRTIO_BLK_ZT_CONV 1
>> +/* Sequential Write Required zone */
>> +#define VIRTIO_BLK_ZT_SWR  2
>> +/* Sequential Write Preferred zone */
>> +#define VIRTIO_BLK_ZT_SWP  3
>> +
>> +/*
>> + * Zone states that are available for zones of all types.
>> + */
>> +
>> +/* Not a write pointer (conventional zones only) */
>> +#define VIRTIO_BLK_ZS_NOT_WP   0
>> +/* Empty */
>> +#define VIRTIO_BLK_ZS_EMPTY    1
>> +/* Implicitly Open */
>> +#define VIRTIO_BLK_ZS_IOPEN    2
>> +/* Explicitly Open */
>> +#define VIRTIO_BLK_ZS_EOPEN    3
>> +/* Closed */
>> +#define VIRTIO_BLK_ZS_CLOSED   4
>> +/* Read-Only */
>> +#define VIRTIO_BLK_ZS_RDONLY   13
>> +/* Full */
>> +#define VIRTIO_BLK_ZS_FULL 14
>> +/* Offline */
>> +#define VIRTIO_BLK_ZS_OFFLINE  15
>> +
>>  /* Unmap this range (only valid for write zeroes command) */
>>  #define 

Re: [PATCH v12 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> Add a new zoned_host_device BlockDriver. The zoned_host_device option
> accepts only zoned host block devices. By adding zone management
> operations in this new BlockDriver, users can use the new block
> layer APIs including Report Zone and four zone management operations
> (open, close, finish, reset, reset_all).
> 
> Qemu-io uses the new APIs to perform zoned storage commands of the device:
> zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
> zone_finish(zf).
> 
> For example, to test zone_report, use following command:
> $ ./build/qemu-io --image-opts -n driver=zoned_host_device,
> filename=/dev/nullb0
> -c "zrp offset nr_zones"
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Hannes Reinecke 
> ---
>  block/block-backend.c | 148 +
>  block/file-posix.c    | 335 ++
>  block/io.c    |  41 
>  include/block/block-io.h  |   7 +
>  include/block/block_int-common.h  |  24 +++
>  include/block/raw-aio.h   |   6 +-
>  include/sysemu/block-backend-io.h |  18 ++
>  meson.build   |   4 +
>  qapi/block-core.json  |   8 +-
>  qemu-io-cmds.c    | 149 +
>  10 files changed, 737 insertions(+), 3 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index aa4adf06ae..1c618e9c68 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
>  void *iobuf;
>  int ret;
>  BdrvRequestFlags flags;
> +    union {
> +    struct {
> +    unsigned int *nr_zones;
> +    BlockZoneDescriptor *zones;
> +    } zone_report;
> +    struct {
> +    unsigned long op;
> +    } zone_mgmt;
> +    };
>  } BlkRwCo;
>  
>  int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
> @@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>  return ret;
>  }
>  
> +static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = >rwco;
> +
> +    rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
> +   rwco->zone_report.nr_zones,
> +   rwco->zone_report.zones);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> +    unsigned int *nr_zones,
> +    BlockZoneDescriptor  *zones,
> +    BlockCompletionFunc *cb, void *opaque)
> +{
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .offset = offset,
> +    .ret    = NOT_DONE,
> +    .zone_report = {
> +    .zones = zones,
> +    .nr_zones = nr_zones,
> +    },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return >common;
> +}
> +
> +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = >rwco;
> +
> +    rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
> + rwco->offset, acb->bytes);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +  int64_t offset, int64_t len,
> +  BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .offset = offset,
> +    .ret    = NOT_DONE,
> +    .zone_mgmt = {
> +    .op = op,
> +    },
> +    };
> +    acb->bytes = len;
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return >common;
> +}
> +
> +/*
> + * Send a zone_report command.
> + * offset is a byte offset from the start of the device. No alignment
> + * required for offset.
> + * nr_zones represents IN maximum and OUT actual.

Re: [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
> This patch extends virtio-blk emulation to handle zoned device commands
> by calling the new block layer APIs to perform zoned device I/O on
> behalf of the guest. It supports Report Zone, four zone oparations (open,
> close, finish, reset), and Append Zone.
> 
> The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> support zoned block devices. Regular block devices(conventional zones)
> will not be set.
> 
> Then the guest os can use blkzone(8) to test those commands on zoned devices.
> Furthermore, using zonefs to test zone append write is also supported.
> 
> Signed-off-by: Sam Li 
> ---
>  hw/block/virtio-blk-common.c   |   2 +
>  hw/block/virtio-blk.c  | 412 -
>  include/hw/virtio/virtio-blk.h |  11 +-
>  3 files changed, 422 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> index ac52d7c176..e2f8e2f6da 100644
> --- a/hw/block/virtio-blk-common.c
> +++ b/hw/block/virtio-blk-common.c
> @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
>   .end = endof(struct virtio_blk_config, discard_sector_alignment)},
>  {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
>   .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> + .end = endof(struct virtio_blk_config, zoned)},
>  {}
>  };
>  
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 8131ec2dbc..58891aea31 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -26,6 +26,9 @@
>  #include "hw/virtio/virtio-blk.h"
>  #include "dataplane/virtio-blk.h"
>  #include "scsi/constants.h"
> +#if defined(CONFIG_BLKZONED)
> +#include <linux/blkzoned.h>
> +#endif
>  #ifdef __linux__
>  # include <scsi/sg.h>
>  #endif
> @@ -55,10 +58,29 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req,
> unsigned char status)
>  {
>  VirtIOBlock *s = req->dev;
>  VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    int64_t inhdr_len, n;
> +    void *buf;
>  
>  trace_virtio_blk_req_complete(vdev, req, status);
>  
> -    stb_p(>in->status, status);
> +    iov_discard_undo(>inhdr_undo);
> +    if (virtio_ldl_p(vdev, >out.type) == VIRTIO_BLK_T_ZONE_APPEND) {
> +    inhdr_len = sizeof(struct virtio_blk_zone_append_inhdr);
> +    req->in.in_hdr->status = status;
> +    buf = req->in.in_hdr;
> +    } else {
> +    inhdr_len = sizeof(struct virtio_blk_inhdr);
> +    req->in.zone_append_inhdr->status = status;
> +    buf = req->in.zone_append_inhdr;
> +    }
> +
> +    n = iov_from_buf(req->elem.in_sg, req->elem.in_num,
> + req->in_len - inhdr_len, buf, inhdr_len);
> +    if (n != inhdr_len) {
> +    virtio_error(vdev, "Driver provided input buffer less than size of "
> + "in header");
> +    }
> +
>  iov_discard_undo(>inhdr_undo);
>  iov_discard_undo(>outhdr_undo);
>  virtqueue_push(req->vq, >elem, req->in_len);
> @@ -592,6 +614,334 @@ err:
>  return err_status;
>  }
>  
> +typedef struct ZoneCmdData {
> +    VirtIOBlockReq *req;
> +    union {
> +    struct {
> +    unsigned int nr_zones;
> +    BlockZoneDescriptor *zones;
> +    } zone_report_data;
> +    struct {
> +    int64_t offset;
> +    } zone_append_data;
> +    };
> +} ZoneCmdData;
> +
> +/*
> + * check zoned_request: error checking before issuing requests. If all checks
> + * passed, return true.
> + * append: true if only zone append requests issued.
> + */
> +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> + bool append, uint8_t *status) {
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int index = offset / bs->bl.zone_size;
> +
> +    if (offset < 0 || len < 0 || offset > bs->bl.capacity - len) {
> +    *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +    return false;
> +    }
> +
> +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> +    *status = VIRTIO_BLK_S_UNSUPP;
> +    return false;
> +    }
> +
> +    if (append) {
> +    if ((offset % bs->bl.write_granularity) != 0) {
> +    *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> +    return false;
> +    }
> +
> +    if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
> +    *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +    return false;
> +    }
> +
> +    if (len / 512 > bs->bl.max_append_sectors) {
> +    if (bs->bl.max_append_sectors == 0) {
> +    *status = VIRTIO_BLK_S_UNSUPP;
> +    } else {
> +    *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +    }
> +    return false;
> +    }
> +    }
> +    return true;
> +}
> +
> +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = 

Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c | 65 ++
>  block/file-posix.c    | 89 +--
>  block/io.c    | 21 
>  block/raw-format.c    |  8 +++
>  include/block/block-io.h  |  3 ++
>  include/block/block_int-common.h  |  5 ++
>  include/block/raw-aio.h   |  4 +-
>  include/sysemu/block-backend-io.h |  9 
>  8 files changed, 198 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 1c618e9c68..06931ddd24 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>  struct {
>  unsigned long op;
>  } zone_mgmt;
> +    struct {
> +    int64_t *append_sector;
> +    } zone_append;
>  };
>  } BlkRwCo;
>  
> @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>  return >common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = >rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, 
> rwco->zone_append.append_sector,
> +   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +    QEMUIOVector *qiov, BdrvRequestFlags flags,
> +    BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .ret    = NOT_DONE,
> +    .flags  = flags,
> +    .iobuf  = qiov,
> +    .zone_append = {
> +    .append_sector = offset,
> +    },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return >common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +    QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +    blk_dec_in_flight(blk);
> +    return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 5ff5500301..3d0cc33d02 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
>  struct {
>  struct iovec *iov;
>  int niov;
> +    int64_t *offset;
>  } io;
>  struct {
>  uint64_t cmd;
> @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>  bs->bl.max_active_zones = ret;
>  }
>  
> +    ret = get_sysfs_long_val(, "physical_block_size");
> +    if (ret >= 0) {
> +    bs->bl.write_granularity = ret;
> +    }
> +
>  bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>  if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
>  error_report("report wps failed");
> @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int
> nr_iov, off_t offset)
>  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>  {
>  ssize_t len;
> +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;

Can this code ever be called for a non-zoned device with 0 zone size?
If yes, you need to avoid division by zero here...

> +
> +    if 

Re: [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is conventional zones.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li 
> ---
>  block/file-posix.c   | 144 +++
>  include/block/block-common.h |  14 +++
>  include/block/block_int-common.h |   3 +
>  3 files changed, 161 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 7c5a330fc1..5ff5500301 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1324,6 +1324,66 @@ static int hdev_get_max_segments(int fd, struct stat
> *st)
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +    unsigned int nrz) {
> +    struct blk_zone *blkz;
> +    int64_t rep_size;

size_t

> +    int64_t sector = offset >> BDRV_SECTOR_BITS;

uint64_t

> +    int ret, n = 0, i = 0;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
> blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +
> +    rep = g_malloc(rep_size);
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +    memset(rep, 0, rep_size);
> +    rep->sector = sector;
> +    rep->nr_zones = nrz - n;
> +
> +    do {
> +    ret = ioctl(fd, BLKREPORTZONE, rep);
> +    } while (ret != 0 && errno == EINTR);
> +    if (ret != 0) {
> +    error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +    fd, offset, errno);
> +    return -errno;
> +    }
> +
> +    if (!rep->nr_zones) {
> +    break;
> +    }
> +
> +    for (i = 0; i < rep->nr_zones; i++, n++) {
> +    /*
> + * The wp tracking cares only about sequential writes required 
> and
> + * sequential write preferred zones so that the wp can advance to
> + * the right location.
> + * Use the most significant bit of the wp location to indicate 
> the
> + * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> + */
> +    if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> +    wps->wp[i] = 1ULL << 63;
> +    } else {
> +    wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> +    }
> +    }
> +    sector = blkz[i - 1].start + blkz[i - 1].len;
> +    }
> +
> +    return 0;
> +}
> +
> +static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +    unsigned int nrz) {
> +    qemu_mutex_lock(&wps->lock);
> +    if (get_zones_wp(fd, wps, offset, nrz) < 0) {
> +    error_report("update zone wp failed");
> +    }
> +    qemu_mutex_unlock(&wps->lock);
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
> @@ -1414,6 +1474,14 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>  if (ret >= 0) {
>  bs->bl.max_active_zones = ret;
>  }
> +
> +    bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> +    if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
> +    error_report("report wps failed");
> +    g_free(bs->bl.wps);
> +    return;
> +    }
> +    qemu_mutex_init(&bs->bl.wps->lock);
>  }
>  }
>  
> @@ -1725,6 +1793,25 @@ static int handle_aiocb_rw(void *opaque)
>  
>  out:
>  if (nbytes == aiocb->aio_nbytes) {
> +#if defined(CONFIG_BLKZONED)
> +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +    if (wps) {

In my testing, I get a divide by zero exception in the "index"
calculation above. Changing this part as follows

-int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
-if (wps) {
+if (wps && aiocb->bs->bl.zone_size) {
+int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+

fixes the crash.

> +    qemu_mutex_lock(&wps->lock);
> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +    uint64_t wend_offset =
> +    aiocb->aio_offset + aiocb->aio_nbytes;
> +
> +    /* Advance the wp if needed */
> +    if (wend_offset > wps->wp[index]) {
> +  

Re: [PATCH v12 6/7] qemu-iotests: test new zone operations

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> We have added new block layer APIs of zoned block devices.
> Test it with:
> Create a null_blk device, run each zone operation on it and see
> whether reporting right zone information.

Change this to "whether the logs show the correct zone information"?

> 

Could you please describe how to run this specific set of tests
in more detail?
 
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> ---
>  tests/qemu-iotests/tests/zoned.out | 53 ++
>  tests/qemu-iotests/tests/zoned.sh  | 86 ++
>  2 files changed, 139 insertions(+)
>  create mode 100644 tests/qemu-iotests/tests/zoned.out
>  create mode 100755 tests/qemu-iotests/tests/zoned.sh
> 
> diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-
> iotests/tests/zoned.out
> new file mode 100644
> index 00..0c8f96deb9
> --- /dev/null
> +++ b/tests/qemu-iotests/tests/zoned.out
> @@ -0,0 +1,53 @@
> +QA output created by zoned.sh
> +Testing a null_blk device:
> +Simple cases: if the operations work
> +(1) report the first zone:
> +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
> +
> +report the first 10 zones
> +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
> +start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 2]
> +start: 0x10, len 0x8, cap 0x8, wptr 0x10, zcond:1, [type: 2]
> +start: 0x18, len 0x8, cap 0x8, wptr 0x18, zcond:1, [type: 2]
> +start: 0x20, len 0x8, cap 0x8, wptr 0x20, zcond:1, [type: 2]
> +start: 0x28, len 0x8, cap 0x8, wptr 0x28, zcond:1, [type: 2]
> +start: 0x30, len 0x8, cap 0x8, wptr 0x30, zcond:1, [type: 2]
> +start: 0x38, len 0x8, cap 0x8, wptr 0x38, zcond:1, [type: 2]
> +start: 0x40, len 0x8, cap 0x8, wptr 0x40, zcond:1, [type: 2]
> +start: 0x48, len 0x8, cap 0x8, wptr 0x48, zcond:1, [type: 2]
> +
> +report the last zone:
> +start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:1, [type:
> 2]
> +
> +
> +(2) opening the first zone
> +report after:
> +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:3, [type: 2]
> +
> +opening the second zone
> +report after:
> +start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:3, [type: 2]
> +
> +opening the last zone
> +report after:
> +start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:3, [type:
> 2]
> +
> +
> +(3) closing the first zone
> +report after:
> +start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
> +
> +closing the last zone
> +report after:
> +start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:1, [type:
> 2]
> +
> +
> +(4) finishing the second zone
> +After finishing a zone:
> +start: 0x8, len 0x8, cap 0x8, wptr 0x10, zcond:14, [type: 2]
> +
> +
> +(5) resetting the second zone
> +After resetting a zone:
> +start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 2]
> +*** done
> diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-
> iotests/tests/zoned.sh
> new file mode 100755
> index 00..fced0194c5
> --- /dev/null
> +++ b/tests/qemu-iotests/tests/zoned.sh
> @@ -0,0 +1,86 @@
> +#!/usr/bin/env bash
> +#
> +# Test zone management operations.
> +#
> +
> +seq="$(basename $0)"
> +echo "QA output created by $seq"
> +status=1 # failure is the default!
> +
> +_cleanup()
> +{
> +  _cleanup_test_img
> +  sudo rmmod null_blk
> +}
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +# get standard environment, filters and checks
> +. ./common.rc
> +. ./common.filter
> +. ./common.qemu
> +
> +# This test only runs on Linux hosts with raw image files.
> +_supported_fmt raw
> +_supported_proto file
> +_supported_os Linux
> +
> +QEMU_IO="build/qemu-io"
> +IMG="--image-opts -n driver=zoned_host_device,filename=/dev/nullb0"
> +QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
> +
> +echo "Testing a null_blk device:"
> +echo "case 1: if the operations work"
> +sudo modprobe null_blk nr_devices=1 zoned=1
> +
> +echo "(1) report the first zone:"
> +sudo $QEMU_IO $IMG -c "zrp 0 1"
> +echo
> +echo "report the first 10 zones"
> +sudo $QEMU_IO $IMG -c "zrp 0 10"
> +echo
> +echo "report the last zone:"
> +sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2" # 0x3e7000 / 512 = 0x1f38
> +echo
> +echo
> +echo "(2) opening the first zone"
> +sudo $QEMU_IO $IMG -c "zo 0 268435456"  # 268435456 / 512 = 524288
> +echo "report after:"
> +sudo $QEMU_IO $IMG -c "zrp 0 1"
> +echo
> +echo "opening the second zone"
> +sudo $QEMU_IO $IMG -c "zo 268435456 268435456" #
> +echo "report after:"
> +sudo $QEMU_IO $IMG -c "zrp 268435456 1"
> +echo
> +echo "opening the last zone"
> +sudo $QEMU_IO $IMG -c "zo 0x3e7000 268435456"
> +echo "report after:"
> +sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2"
> +echo
> +echo
> +echo "(3) closing the first zone"
> +sudo $QEMU_IO $IMG -c "zc 0 268435456"
> +echo "report after:"
> +sudo 

Re: [PATCH v12 5/7] config: add check to block layer

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> Putting zoned/non-zoned BlockDrivers on top of each other is not
> allowed.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Hannes Reinecke 

Reviewed-by: Dmitry Fomichev 

> ---
>  block.c  | 19 +++
>  block/file-posix.c   | 12 
>  block/raw-format.c   |  1 +
>  include/block/block_int-common.h |  5 +
>  4 files changed, 37 insertions(+)
> 
> diff --git a/block.c b/block.c
> index 1fbf6b9e69..5d6fa4a25a 100644
> --- a/block.c
> +++ b/block.c
> @@ -7951,6 +7951,25 @@ void bdrv_add_child(BlockDriverState *parent_bs,
> BlockDriverState *child_bs,
>  return;
>  }
>  
> +    /*
> + * Non-zoned block drivers do not follow zoned storage constraints
> + * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
> + * drivers in a graph.
> + */
> +    if (!parent_bs->drv->supports_zoned_children &&
> +    child_bs->bl.zoned == BLK_Z_HM) {
> +    /*
> + * The host-aware model allows zoned storage constraints and random
> + * write. Allow mixing host-aware and non-zoned drivers. Using
> + * host-aware device as a regular device.
> + */
> +    error_setg(errp, "Cannot add a %s child to a %s parent",
> +   child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
> +   parent_bs->drv->supports_zoned_children ?
> +   "support zoned children" : "not support zoned children");
> +    return;
> +    }
> +
>  if (!QLIST_EMPTY(&child_bs->parents)) {
>  error_setg(errp, "The node %s already has a parent",
>     child_bs->node_name);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index bd28e3eaea..7c5a330fc1 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -776,6 +776,18 @@ static int raw_open_common(BlockDriverState *bs, QDict
> *options,
>  goto fail;
>  }
>  }
> +#ifdef CONFIG_BLKZONED
> +    /*
> + * The kernel page cache does not reliably work for writes to SWR zones
> + * of zoned block device because it can not guarantee the order of 
> writes.
> + */
> +    if ((strcmp(bs->drv->format_name, "zoned_host_device") == 0) &&
> +    (!(s->open_flags & O_DIRECT))) {
> +    error_setg(errp, "driver=zoned_host_device was specified, but it "
> +   "requires cache.direct=on, which was not specified.");
> +    return -EINVAL; /* No host kernel page cache */
> +    }
> +#endif
>  
>  if (S_ISBLK(st.st_mode)) {
>  #ifdef __linux__
> diff --git a/block/raw-format.c b/block/raw-format.c
> index bac43f1d25..18dc52a150 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -615,6 +615,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild
> *c,
>  BlockDriver bdrv_raw = {
>  .format_name  = "raw",
>  .instance_size    = sizeof(BDRVRawState),
> +    .supports_zoned_children = true,
>  .bdrv_probe   = &raw_probe,
>  .bdrv_reopen_prepare  = &raw_reopen_prepare,
>  .bdrv_reopen_commit   = &raw_reopen_commit,
> diff --git a/include/block/block_int-common.h b/include/block/block_int-
> common.h
> index cdc06e77a6..37dddc603c 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -127,6 +127,11 @@ struct BlockDriver {
>   */
>  bool is_format;
>  
> +    /*
> + * Set to true if the BlockDriver supports zoned children.
> + */
> +    bool supports_zoned_children;
> +
>  /*
>   * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
>   * this field set to true, except ones that are defined only by their



Re: [PATCH v12 4/7] raw-format: add zone operations to pass through requests

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> raw-format driver usually sits on top of file-posix driver. It needs to
> pass through requests of zone commands.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Damien Le Moal 
> Reviewed-by: Hannes Reinecke 

Reviewed-by: Dmitry Fomichev 

> ---
>  block/raw-format.c | 13 +
>  1 file changed, 13 insertions(+)
> 
> diff --git a/block/raw-format.c b/block/raw-format.c
> index f337ac7569..bac43f1d25 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -314,6 +314,17 @@ static int coroutine_fn raw_co_pdiscard(BlockDriverState
> *bs,
>  return bdrv_co_pdiscard(bs->file, offset, bytes);
>  }
>  
> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t
> offset,
> +   unsigned int *nr_zones,
> +   BlockZoneDescriptor *zones) {
> +    return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
> +}
> +
> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp 
> op,
> + int64_t offset, int64_t len) {
> +    return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
> +}
> +
>  static int64_t raw_getlength(BlockDriverState *bs)
>  {
>  int64_t len;
> @@ -615,6 +626,8 @@ BlockDriver bdrv_raw = {
>  .bdrv_co_pwritev  = &raw_co_pwritev,
>  .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
>  .bdrv_co_pdiscard = &raw_co_pdiscard,
> +    .bdrv_co_zone_report  = &raw_co_zone_report,
> +    .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
>  .bdrv_co_block_status = &raw_co_block_status,
>  .bdrv_co_copy_range_from = &raw_co_copy_range_from,
>  .bdrv_co_copy_range_to  = &raw_co_copy_range_to,



Re: [PATCH v12 1/7] include: add zoned device structs

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Damien Le Moal 
> Reviewed-by: Hannes Reinecke 
> ---
>  include/block/block-common.h | 43 
>  1 file changed, 43 insertions(+)
> 
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index fdb7306e78..36bd0e480e 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -49,6 +49,49 @@ typedef struct BlockDriver BlockDriver;
>  typedef struct BdrvChild BdrvChild;
>  typedef struct BdrvChildClass BdrvChildClass;
>  
> +typedef enum BlockZoneOp {
> +    BLK_ZO_OPEN,
> +    BLK_ZO_CLOSE,
> +    BLK_ZO_FINISH,
> +    BLK_ZO_RESET,
> +} BlockZoneOp;
> +
> +typedef enum BlockZoneModel {
> +    BLK_Z_NONE = 0x0, /* Regular block device */
> +    BLK_Z_HM = 0x1, /* Host-managed zoned block device */
> +    BLK_Z_HA = 0x2, /* Host-aware zoned block device */
> +} BlockZoneModel;
> +
> +typedef enum BlockZoneCondition {
> +    BLK_ZS_NOT_WP = 0x0,
> +    BLK_ZS_EMPTY = 0x1,
> +    BLK_ZS_IOPEN = 0x2,
> +    BLK_ZS_EOPEN = 0x3,
> +    BLK_ZS_CLOSED = 0x4,
> +    BLK_ZS_RDONLY = 0xD,
> +    BLK_ZS_FULL = 0xE,
> +    BLK_ZS_OFFLINE = 0xF,
> +} BlockZoneCondition;

The virtio-zbd specification doesn't define conditions, it uses the term
"state" instead, similar to ZNS. Please rename BlockZoneCondition to
BlockZoneState to follow the spec terminology.

> +
> +typedef enum BlockZoneType {
> +    BLK_ZT_CONV = 0x1, /* Conventional random writes supported */
> +    BLK_ZT_SWR = 0x2, /* Sequential writes required */
> +    BLK_ZT_SWP = 0x3, /* Sequential writes preferred */
> +} BlockZoneType;
> +
> +/*
> + * Zone descriptor data structure.
> + * Provides information on a zone with all position and size values in bytes.
> + */
> +typedef struct BlockZoneDescriptor {
> +    uint64_t start;
> +    uint64_t length;
> +    uint64_t cap;
> +    uint64_t wp;
> +    BlockZoneType type;
> +    BlockZoneCondition cond;

BlockZoneState state;

> +} BlockZoneDescriptor;
> +
>  typedef struct BlockDriverInfo {
>  /* in bytes, 0 if irrelevant */
>  int cluster_size;



Re: [PATCH v12 7/7] docs/zoned-storage: add zoned device documentation

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> Add the documentation about the zoned device support to virtio-blk
> emulation.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Damien Le Moal 
> ---
>  docs/devel/zoned-storage.rst   | 43 ++
>  docs/system/qemu-block-drivers.rst.inc |  6 
>  2 files changed, 49 insertions(+)
>  create mode 100644 docs/devel/zoned-storage.rst
> 
> diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
> new file mode 100644
> index 00..cf169d029b
> --- /dev/null
> +++ b/docs/devel/zoned-storage.rst
> @@ -0,0 +1,43 @@
> +=============
> +zoned-storage
> +=============
> +
> +Zoned Block Devices (ZBDs) divide the LBA space into block regions called
> zones
> +that are larger than the LBA size. They can only allow sequential writes,
> which
> +can reduce write amplification in SSDs, and potentially lead to higher
> +throughput and increased capacity. More details about ZBDs can be found at:
> +
> +https://zonedstorage.io/docs/introduction/zoned-storage
> +
> +1. Block layer APIs for zoned storage
> +-------------------------------------
> +QEMU block layer has three zoned storage model:

replace it with

+QEMU block layer supports three zoned storage models:

? with this nit,

Reviewed-by: Dmitry Fomichev 

> +- BLK_Z_HM: The host-managed zoned model only allows sequential writes access
> +to zones. It supports ZBD-specific I/O commands that can be used by a host to
> +manage the zones of a device.
> +- BLK_Z_HA: The host-aware zoned model allows random write operations in
> +zones, making it backward compatible with regular block devices.
> +- BLK_Z_NONE: The non-zoned model has no zones support. It includes both
> +regular and drive-managed ZBD devices. ZBD-specific I/O commands are not
> +supported.
> +
> +The block device information resides inside BlockDriverState. QEMU uses
> +BlockLimits struct(BlockDriverState::bl) that is continuously accessed by the
> +block layer while processing I/O requests. A BlockBackend has a root pointer
> to
> +a BlockDriverState graph(for example, raw format on top of file-posix). The
> +zoned storage information can be propagated from the leaf BlockDriverState 
> all
> +the way up to the BlockBackend. If the zoned storage model in file-posix is
> +set to BLK_Z_HM, then block drivers will declare support for zoned host
> device.
> +
> +The block layer APIs support commands needed for zoned storage devices,
> +including report zones, four zone operations, and zone append.
> +
> +2. Emulating zoned storage controllers
> +--------------------------------------
> +When the BlockBackend's BlockLimits model reports a zoned storage device,
> users
> +like the virtio-blk emulation or the qemu-io-cmds.c utility can use block
> layer
> +APIs for zoned storage emulation or testing.
> +
> +For example, to test zone_report on a null_blk device using qemu-io is:
> +$ path/to/qemu-io --image-opts -n
> driver=zoned_host_device,filename=/dev/nullb0
> +-c "zrp offset nr_zones"
> diff --git a/docs/system/qemu-block-drivers.rst.inc b/docs/system/qemu-block-
> drivers.rst.inc
> index dfe5d2293d..0b97227fd9 100644
> --- a/docs/system/qemu-block-drivers.rst.inc
> +++ b/docs/system/qemu-block-drivers.rst.inc
> @@ -430,6 +430,12 @@ Hard disks
>    you may corrupt your host data (use the ``-snapshot`` command
>    line option or modify the device permissions accordingly).
>  
> +Zoned block devices
> +  Zoned block devices can be passed through to the guest if the emulated
> storage
> +  controller supports zoned storage. Use ``--blockdev zoned_host_device,
> +  node-name=drive0,filename=/dev/nullb0`` to pass through ``/dev/nullb0``
> +  as ``drive0``.
> +
>  Windows
>  ^^^
>  



Re: [RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
> Use scripts/update-linux-headers.sh to update virtio-blk headers
> from Dmitry's "virtio-blk:add support for zoned block devices"
> linux patch. There is a link for more information:
> https://github.com/dmitry-fomichev/virtblk-zbd
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> Signed-off-by: Sam Li 

The duplicate sign-off is not needed. With this fixed,

Reviewed-by: Dmitry Fomichev 

> ---
>  include/standard-headers/linux/virtio_blk.h | 109 
>  1 file changed, 109 insertions(+)
> 
> diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-
> headers/linux/virtio_blk.h
> index 2dcc90826a..490bd21c76 100644
> --- a/include/standard-headers/linux/virtio_blk.h
> +++ b/include/standard-headers/linux/virtio_blk.h
> @@ -40,6 +40,7 @@
>  #define VIRTIO_BLK_F_MQ12  /* support more than one vq */
>  #define VIRTIO_BLK_F_DISCARD   13  /* DISCARD is supported */
>  #define VIRTIO_BLK_F_WRITE_ZEROES  14  /* WRITE ZEROES is supported 
> */
> +#define VIRTIO_BLK_F_ZONED 17  /* Zoned block device */
>  
>  /* Legacy feature bits */
>  #ifndef VIRTIO_BLK_NO_LEGACY
> @@ -119,6 +120,20 @@ struct virtio_blk_config {
> uint8_t write_zeroes_may_unmap;
>  
> uint8_t unused1[3];
> +
> +   /* Secure erase fields that are defined in the virtio spec */
> +   uint8_t sec_erase[12];
> +
> +   /* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
> +   struct virtio_blk_zoned_characteristics {
> +   __virtio32 zone_sectors;
> +   __virtio32 max_open_zones;
> +   __virtio32 max_active_zones;
> +   __virtio32 max_append_sectors;
> +   __virtio32 write_granularity;
> +   uint8_t model;
> +   uint8_t unused2[3];
> +   } zoned;
>  } QEMU_PACKED;
>  
>  /*
> @@ -153,6 +168,27 @@ struct virtio_blk_config {
>  /* Write zeroes command */
>  #define VIRTIO_BLK_T_WRITE_ZEROES  13
>  
> +/* Zone append command */
> +#define VIRTIO_BLK_T_ZONE_APPEND    15
> +
> +/* Report zones command */
> +#define VIRTIO_BLK_T_ZONE_REPORT    16
> +
> +/* Open zone command */
> +#define VIRTIO_BLK_T_ZONE_OPEN  18
> +
> +/* Close zone command */
> +#define VIRTIO_BLK_T_ZONE_CLOSE 20
> +
> +/* Finish zone command */
> +#define VIRTIO_BLK_T_ZONE_FINISH    22
> +
> +/* Reset zone command */
> +#define VIRTIO_BLK_T_ZONE_RESET 24
> +
> +/* Reset All zones command */
> +#define VIRTIO_BLK_T_ZONE_RESET_ALL 26
> +
>  #ifndef VIRTIO_BLK_NO_LEGACY
>  /* Barrier before this op. */
>  #define VIRTIO_BLK_T_BARRIER   0x8000
> @@ -172,6 +208,72 @@ struct virtio_blk_outhdr {
> __virtio64 sector;
>  };
>  
> +/*
> + * Supported zoned device models.
> + */
> +
> +/* Regular block device */
> +#define VIRTIO_BLK_Z_NONE  0
> +/* Host-managed zoned device */
> +#define VIRTIO_BLK_Z_HM    1
> +/* Host-aware zoned device */
> +#define VIRTIO_BLK_Z_HA    2
> +
> +/*
> + * Zone descriptor. A part of VIRTIO_BLK_T_ZONE_REPORT command reply.
> + */
> +struct virtio_blk_zone_descriptor {
> +   /* Zone capacity */
> +   __virtio64 z_cap;
> +   /* The starting sector of the zone */
> +   __virtio64 z_start;
> +   /* Zone write pointer position in sectors */
> +   __virtio64 z_wp;
> +   /* Zone type */
> +   uint8_t z_type;
> +   /* Zone state */
> +   uint8_t z_state;
> +   uint8_t reserved[38];
> +};
> +
> +struct virtio_blk_zone_report {
> +   __virtio64 nr_zones;
> +   uint8_t reserved[56];
> +   struct virtio_blk_zone_descriptor zones[];
> +};
> +
> +/*
> + * Supported zone types.
> + */
> +
> +/* Conventional zone */
> +#define VIRTIO_BLK_ZT_CONV 1
> +/* Sequential Write Required zone */
> +#define VIRTIO_BLK_ZT_SWR  2
> +/* Sequential Write Preferred zone */
> +#define VIRTIO_BLK_ZT_SWP  3
> +
> +/*
> + * Zone states that are available for zones of all types.
> + */
> +
> +/* Not a write pointer (conventional zones only) */
> +#define VIRTIO_BLK_ZS_NOT_WP   0
> +/* Empty */
> +#define VIRTIO_BLK_ZS_EMPTY    1
> +/* Implicitly Open */
> +#define VIRTIO_BLK_ZS_IOPEN    2
> +/* Explicitly Open */
> +#define VIRTIO_BLK_ZS_EOPEN    3
> +/* Closed */
> +#define VIRTIO_BLK_ZS_CLOSED   4
> +/* Read-Only */
> +#define VIRTIO_BLK_ZS_RDONLY   13
> +/* Full */
> +#define VIRTIO_BLK_ZS_FULL 14
> +/* Offline */
> +#define VIRTIO_BLK_ZS_OFFLINE  15
> +
>  /* Unmap this range (only valid for write zeroes command) */
>  #define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP 0x0001
>  
> @@ -198,4 +300,11 @@ struct virtio_scsi_inhdr {
>  #define VIRTIO_BLK_S_OK0
>  #define VIRTIO_BLK_S_IOERR 1
>  #define VIRTIO_BLK_S_UNSUPP2
> +
> +/* Error codes that are specific to zoned block devices */
> +#define VIRTIO_BLK_S_ZONE_INVALID_CMD 3
> +#define 

Re: [PATCH v12 2/7] file-posix: introduce helper functions for sysfs attributes

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote:
> Use get_sysfs_str_val() to get the string value of device
> zoned model. Then get_sysfs_zoned_model() can convert it to
> BlockZoneModel type of QEMU.
> 
> Use get_sysfs_long_val() to get the long value of zoned device
> information.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Hannes Reinecke 
> Reviewed-by: Stefan Hajnoczi 
> Reviewed-by: Damien Le Moal 

Reviewed-by: Dmitry Fomichev 

> ---
>  block/file-posix.c   | 124 ++-
>  include/block/block_int-common.h |   3 +
>  2 files changed, 91 insertions(+), 36 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 23acffb9a4..8cb07fdb8a 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1201,66 +1201,112 @@ static int hdev_get_max_hw_transfer(int fd, struct
> stat *st)
>  #endif
>  }
>  
> -static int hdev_get_max_segments(int fd, struct stat *st)
> -{
> +/*
> + * Get a sysfs attribute value as character string.
> + */
> +static int get_sysfs_str_val(struct stat *st, const char *attribute,
> + char **val) {
>  #ifdef CONFIG_LINUX
> -    char buf[32];
> -    const char *end;
> -    char *sysfspath = NULL;
> +    g_autofree char *sysfspath = NULL;
>  int ret;
> -    int sysfd = -1;
> -    long max_segments;
> +    size_t len;
>  
> -    if (S_ISCHR(st->st_mode)) {
> -    if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> -    return ret;
> -    }
> +    if (!S_ISBLK(st->st_mode)) {
>  return -ENOTSUP;
>  }
>  
> -    if (!S_ISBLK(st->st_mode)) {
> -    return -ENOTSUP;
> +    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
> +    major(st->st_rdev), minor(st->st_rdev),
> +    attribute);
> +    ret = g_file_get_contents(sysfspath, val, &len, NULL);
> +    if (ret == -1) {
> +    return -ENOENT;
>  }
>  
> -    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> -    major(st->st_rdev), minor(st->st_rdev));
> -    sysfd = open(sysfspath, O_RDONLY);
> -    if (sysfd == -1) {
> -    ret = -errno;
> -    goto out;
> +    /* The file is ended with '\n' */
> +    char *p;
> +    p = *val;
> +    if (*(p + len - 1) == '\n') {
> +    *(p + len - 1) = '\0';
>  }
> -    do {
> -    ret = read(sysfd, buf, sizeof(buf) - 1);
> -    } while (ret == -1 && errno == EINTR);
> +    return ret;
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
> +static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
> +{
> +    g_autofree char *val = NULL;
> +    int ret;
> +
> +    ret = get_sysfs_str_val(st, "zoned", &val);
>  if (ret < 0) {
> -    ret = -errno;
> -    goto out;
> -    } else if (ret == 0) {
> -    ret = -EIO;
> -    goto out;
> +    return ret;
>  }
> -    buf[ret] = 0;
> -    /* The file is ended with '\n', pass 'end' to accept that. */
> -    ret = qemu_strtol(buf, &end, 10, &max_segments);
> -    if (ret == 0 && end && *end == '\n') {
> -    ret = max_segments;
> +
> +    if (strcmp(val, "host-managed") == 0) {
> +    *zoned = BLK_Z_HM;
> +    } else if (strcmp(val, "host-aware") == 0) {
> +    *zoned = BLK_Z_HA;
> +    } else if (strcmp(val, "none") == 0) {
> +    *zoned = BLK_Z_NONE;
> +    } else {
> +    return -ENOTSUP;
>  }
> +    return 0;
> +}
>  
> -out:
> -    if (sysfd != -1) {
> -    close(sysfd);
> +/*
> + * Get a sysfs attribute value as a long integer.
> + */
> +static long get_sysfs_long_val(struct stat *st, const char *attribute)
> +{
> +#ifdef CONFIG_LINUX
> +    g_autofree char *str = NULL;
> +    const char *end;
> +    long val;
> +    int ret;
> +
> +    ret = get_sysfs_str_val(st, attribute, &str);
> +    if (ret < 0) {
> +    return ret;
> +    }
> +
> +    /* The file is ended with '\n', pass 'end' to accept that. */
> +    ret = qemu_strtol(str, &end, 10, &val);
> +    if (ret == 0 && end && *end == '\0') {
> +    ret = val;
>  }
> -    g_free(sysfspath);
>  return ret;
>  #else
>  return -ENOTSUP;
>  #endif
>  }
>  
> +static int hdev_get_max_segments(int fd, struct stat *st)
> +{
> +#ifdef CONFIG_LINUX
> +    int ret;
> +
> +    if (S_ISCHR(st->st_mode)) {
> +    if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> +    return ret;
> +    }
> +    return -ENOTSUP;
> +    }
> +    return get_sysfs_long_val(st, "max_segments");
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
>  struct stat st;
> +    int ret;
> +    BlockZoneModel zoned;
>  
>  s->needs_alignment = raw_needs_alignment(bs);
>  raw_probe_alignment(bs, s->fd, errp);
> @@ -1298,6 +1344,12 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>  bs->bl.max_hw_iov = ret;
>  }
>  }
> +
> +    ret = 

Re: [PATCH v4 00/10] m25p80: Add SFDP support

2022-10-16 Thread Joel Stanley
On Thu, 13 Oct 2022 at 16:12, Cédric Le Goater  wrote:
>
> Hello,
>
> This patchset adds support for JEDEC STANDARD JESD216 Serial Flash
> Discovery Parameters (SFDP). SFDP describes the features of a serial
> flash device using a set of internal parameter tables. Support in
> Linux has been added some time ago and the spi-nor driver is using it
> more often to detect the flash settings and even flash models.

Reviewed-by: Joel Stanley 
Tested-by: Joel Stanley 

Thanks Cédric!



[RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+

2022-10-16 Thread Sam Li
Use scripts/update-linux-headers.sh to update virtio-blk headers
from Dmitry's "virtio-blk:add support for zoned block devices"
linux patch. There is a link for more information:
https://github.com/dmitry-fomichev/virtblk-zbd

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Sam Li 
---
 include/standard-headers/linux/virtio_blk.h | 109 
 1 file changed, 109 insertions(+)

diff --git a/include/standard-headers/linux/virtio_blk.h 
b/include/standard-headers/linux/virtio_blk.h
index 2dcc90826a..490bd21c76 100644
--- a/include/standard-headers/linux/virtio_blk.h
+++ b/include/standard-headers/linux/virtio_blk.h
@@ -40,6 +40,7 @@
 #define VIRTIO_BLK_F_MQ12  /* support more than one vq */
 #define VIRTIO_BLK_F_DISCARD   13  /* DISCARD is supported */
 #define VIRTIO_BLK_F_WRITE_ZEROES  14  /* WRITE ZEROES is supported */
+#define VIRTIO_BLK_F_ZONED 17  /* Zoned block device */
 
 /* Legacy feature bits */
 #ifndef VIRTIO_BLK_NO_LEGACY
@@ -119,6 +120,20 @@ struct virtio_blk_config {
uint8_t write_zeroes_may_unmap;
 
uint8_t unused1[3];
+
+   /* Secure erase fields that are defined in the virtio spec */
+   uint8_t sec_erase[12];
+
+   /* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
+   struct virtio_blk_zoned_characteristics {
+   __virtio32 zone_sectors;
+   __virtio32 max_open_zones;
+   __virtio32 max_active_zones;
+   __virtio32 max_append_sectors;
+   __virtio32 write_granularity;
+   uint8_t model;
+   uint8_t unused2[3];
+   } zoned;
 } QEMU_PACKED;
 
 /*
@@ -153,6 +168,27 @@ struct virtio_blk_config {
 /* Write zeroes command */
 #define VIRTIO_BLK_T_WRITE_ZEROES  13
 
+/* Zone append command */
+#define VIRTIO_BLK_T_ZONE_APPEND15
+
+/* Report zones command */
+#define VIRTIO_BLK_T_ZONE_REPORT16
+
+/* Open zone command */
+#define VIRTIO_BLK_T_ZONE_OPEN  18
+
+/* Close zone command */
+#define VIRTIO_BLK_T_ZONE_CLOSE 20
+
+/* Finish zone command */
+#define VIRTIO_BLK_T_ZONE_FINISH22
+
+/* Reset zone command */
+#define VIRTIO_BLK_T_ZONE_RESET 24
+
+/* Reset All zones command */
+#define VIRTIO_BLK_T_ZONE_RESET_ALL 26
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER   0x8000
@@ -172,6 +208,72 @@ struct virtio_blk_outhdr {
__virtio64 sector;
 };
 
+/*
+ * Supported zoned device models.
+ */
+
+/* Regular block device */
+#define VIRTIO_BLK_Z_NONE  0
+/* Host-managed zoned device */
+#define VIRTIO_BLK_Z_HM1
+/* Host-aware zoned device */
+#define VIRTIO_BLK_Z_HA2
+
+/*
+ * Zone descriptor. A part of VIRTIO_BLK_T_ZONE_REPORT command reply.
+ */
+struct virtio_blk_zone_descriptor {
+   /* Zone capacity */
+   __virtio64 z_cap;
+   /* The starting sector of the zone */
+   __virtio64 z_start;
+   /* Zone write pointer position in sectors */
+   __virtio64 z_wp;
+   /* Zone type */
+   uint8_t z_type;
+   /* Zone state */
+   uint8_t z_state;
+   uint8_t reserved[38];
+};
+
+struct virtio_blk_zone_report {
+   __virtio64 nr_zones;
+   uint8_t reserved[56];
+   struct virtio_blk_zone_descriptor zones[];
+};
+
+/*
+ * Supported zone types.
+ */
+
+/* Conventional zone */
+#define VIRTIO_BLK_ZT_CONV 1
+/* Sequential Write Required zone */
+#define VIRTIO_BLK_ZT_SWR  2
+/* Sequential Write Preferred zone */
+#define VIRTIO_BLK_ZT_SWP  3
+
+/*
+ * Zone states that are available for zones of all types.
+ */
+
+/* Not a write pointer (conventional zones only) */
+#define VIRTIO_BLK_ZS_NOT_WP   0
+/* Empty */
+#define VIRTIO_BLK_ZS_EMPTY1
+/* Implicitly Open */
+#define VIRTIO_BLK_ZS_IOPEN2
+/* Explicitly Open */
+#define VIRTIO_BLK_ZS_EOPEN3
+/* Closed */
+#define VIRTIO_BLK_ZS_CLOSED   4
+/* Read-Only */
+#define VIRTIO_BLK_ZS_RDONLY   13
+/* Full */
+#define VIRTIO_BLK_ZS_FULL 14
+/* Offline */
+#define VIRTIO_BLK_ZS_OFFLINE  15
+
 /* Unmap this range (only valid for write zeroes command) */
 #define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP 0x0001
 
@@ -198,4 +300,11 @@ struct virtio_scsi_inhdr {
 #define VIRTIO_BLK_S_OK0
 #define VIRTIO_BLK_S_IOERR 1
 #define VIRTIO_BLK_S_UNSUPP2
+
+/* Error codes that are specific to zoned block devices */
+#define VIRTIO_BLK_S_ZONE_INVALID_CMD 3
+#define VIRTIO_BLK_S_ZONE_UNALIGNED_WP4
+#define VIRTIO_BLK_S_ZONE_OPEN_RESOURCE   5
+#define VIRTIO_BLK_S_ZONE_ACTIVE_RESOURCE 6
+
 #endif /* _LINUX_VIRTIO_BLK_H */
-- 
2.37.3




[RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices

2022-10-16 Thread Sam Li
This patch extends virtio-blk emulation to handle zoned device commands
by calling the new block layer APIs to perform zoned device I/O on
behalf of the guest. It supports Report Zone, four zone operations (open,
close, finish, reset), and Append Zone.

The VIRTIO_BLK_F_ZONED feature bit will only be set if the host
supports zoned block devices. It will not be set for regular block
devices (conventional zones).

The guest OS can then use blkzone(8) to test those commands on zoned devices.
Furthermore, using zonefs to test zone append write is also supported.

Signed-off-by: Sam Li 
---
 hw/block/virtio-blk-common.c   |   2 +
 hw/block/virtio-blk.c  | 412 -
 include/hw/virtio/virtio-blk.h |  11 +-
 3 files changed, 422 insertions(+), 3 deletions(-)

diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
index ac52d7c176..e2f8e2f6da 100644
--- a/hw/block/virtio-blk-common.c
+++ b/hw/block/virtio-blk-common.c
@@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
  .end = endof(struct virtio_blk_config, discard_sector_alignment)},
 {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
  .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
+{.flags = 1ULL << VIRTIO_BLK_F_ZONED,
+ .end = endof(struct virtio_blk_config, zoned)},
 {}
 };
 
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 8131ec2dbc..58891aea31 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -26,6 +26,9 @@
 #include "hw/virtio/virtio-blk.h"
 #include "dataplane/virtio-blk.h"
 #include "scsi/constants.h"
+#if defined(CONFIG_BLKZONED)
+#include 
+#endif
 #ifdef __linux__
 # include 
 #endif
@@ -55,10 +58,29 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, 
unsigned char status)
 {
 VirtIOBlock *s = req->dev;
 VirtIODevice *vdev = VIRTIO_DEVICE(s);
+int64_t inhdr_len, n;
+void *buf;
 
 trace_virtio_blk_req_complete(vdev, req, status);
 
-stb_p(>in->status, status);
+iov_discard_undo(>inhdr_undo);
+if (virtio_ldl_p(vdev, >out.type) == VIRTIO_BLK_T_ZONE_APPEND) {
+inhdr_len = sizeof(struct virtio_blk_zone_append_inhdr);
+req->in.in_hdr->status = status;
+buf = req->in.in_hdr;
+} else {
+inhdr_len = sizeof(struct virtio_blk_inhdr);
+req->in.zone_append_inhdr->status = status;
+buf = req->in.zone_append_inhdr;
+}
+
+n = iov_from_buf(req->elem.in_sg, req->elem.in_num,
+ req->in_len - inhdr_len, buf, inhdr_len);
+if (n != inhdr_len) {
+virtio_error(vdev, "Driver provided input buffer less than size of "
+ "in header");
+}
+
 iov_discard_undo(>inhdr_undo);
 iov_discard_undo(>outhdr_undo);
 virtqueue_push(req->vq, >elem, req->in_len);
@@ -592,6 +614,334 @@ err:
 return err_status;
 }
 
+typedef struct ZoneCmdData {
+VirtIOBlockReq *req;
+union {
+struct {
+unsigned int nr_zones;
+BlockZoneDescriptor *zones;
+} zone_report_data;
+struct {
+int64_t offset;
+} zone_append_data;
+};
+} ZoneCmdData;
+
+/*
+ * check zoned_request: error checking before issuing requests. If all checks
+ * passed, return true.
+ * append: true if only zone append requests issued.
+ */
+static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
+ bool append, uint8_t *status) {
+BlockDriverState *bs = blk_bs(s->blk);
+int index = offset / bs->bl.zone_size;
+
+if (offset < 0 || len < 0 || offset > bs->bl.capacity - len) {
+*status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+return false;
+}
+
+if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
+*status = VIRTIO_BLK_S_UNSUPP;
+return false;
+}
+
+if (append) {
+if ((offset % bs->bl.write_granularity) != 0) {
+*status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
+return false;
+}
+
+if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
+*status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+return false;
+}
+
+if (len / 512 > bs->bl.max_append_sectors) {
+if (bs->bl.max_append_sectors == 0) {
+*status = VIRTIO_BLK_S_UNSUPP;
+} else {
+*status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+}
+return false;
+}
+}
+return true;
+}
+
+static void virtio_blk_zone_report_complete(void *opaque, int ret)
+{
+ZoneCmdData *data = opaque;
+VirtIOBlockReq *req = data->req;
+VirtIOBlock *s = req->dev;
+VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+struct iovec *in_iov = req->elem.in_sg;
+unsigned in_num = req->elem.in_num;
+int64_t zrp_size, nz, n, j = 0;
+int8_t err_status = VIRTIO_BLK_S_OK;
+
+if (ret) {
+err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+goto out;
+}
+
+nz = 

[PATCH v4 3/3] qemu-iotests: test zone append operation

2022-10-16 Thread Sam Li
This test is mainly a helper to indicate that append writes in the block
layer behave as expected.

Signed-off-by: Sam Li 
---
 qemu-io-cmds.c | 63 ++
 tests/qemu-iotests/tests/zoned.out |  7 
 tests/qemu-iotests/tests/zoned.sh  |  9 +
 3 files changed, 79 insertions(+)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index c1b28ea108..ca92291a44 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1856,6 +1856,68 @@ static const cmdinfo_t zone_reset_cmd = {
 .oneline = "reset a zone write pointer in zone block device",
 };
 
+static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
+  int64_t *offset, int flags, int *total)
+{
+int async_ret = NOT_DONE;
+
+blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, _ret);
+while (async_ret == NOT_DONE) {
+main_loop_wait(false);
+}
+
+*total = qiov->size;
+return async_ret < 0 ? async_ret : 1;
+}
+
+static int zone_append_f(BlockBackend *blk, int argc, char **argv)
+{
+int ret;
+int flags = 0;
+int total = 0;
+int64_t offset;
+char *buf;
+int nr_iov;
+int pattern = 0xcd;
+QEMUIOVector qiov;
+
+if (optind > argc - 2) {
+return -EINVAL;
+}
+optind++;
+offset = cvtnum(argv[optind]);
+if (offset < 0) {
+print_cvtnum_err(offset, argv[optind]);
+return offset;
+}
+optind++;
+nr_iov = argc - optind;
+buf = create_iovec(blk, , [optind], nr_iov, pattern);
+if (buf == NULL) {
+return -EINVAL;
+}
+ret = do_aio_zone_append(blk, , , flags, );
+if (ret < 0) {
+printf("zone append failed: %s\n", strerror(-ret));
+goto out;
+}
+
+out:
+qemu_iovec_destroy();
+qemu_io_free(buf);
+return ret;
+}
+
+static const cmdinfo_t zone_append_cmd = {
+.name = "zone_append",
+.altname = "zap",
+.cfunc = zone_append_f,
+.argmin = 3,
+.argmax = 3,
+.args = "offset len [len..]",
+.oneline = "append write a number of bytes at a specified offset",
+};
+
 static int truncate_f(BlockBackend *blk, int argc, char **argv);
 static const cmdinfo_t truncate_cmd = {
 .name   = "truncate",
@@ -2653,6 +2715,7 @@ static void __attribute((constructor)) 
init_qemuio_commands(void)
 qemuio_add_command(_close_cmd);
 qemuio_add_command(_finish_cmd);
 qemuio_add_command(_reset_cmd);
+qemuio_add_command(_append_cmd);
 qemuio_add_command(_cmd);
 qemuio_add_command(_cmd);
 qemuio_add_command(_cmd);
diff --git a/tests/qemu-iotests/tests/zoned.out 
b/tests/qemu-iotests/tests/zoned.out
index 0c8f96deb9..b3b139b4ec 100644
--- a/tests/qemu-iotests/tests/zoned.out
+++ b/tests/qemu-iotests/tests/zoned.out
@@ -50,4 +50,11 @@ start: 0x8, len 0x8, cap 0x8, wptr 0x10, 
zcond:14, [type: 2]
 (5) resetting the second zone
 After resetting a zone:
 start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 2]
+
+
+(6) append write
+After appending the first zone:
+start: 0x0, len 0x8, cap 0x8, wptr 0x18, zcond:2, [type: 2]
+After appending the second zone:
+start: 0x8, len 0x8, cap 0x8, wptr 0x80018, zcond:2, [type: 2]
 *** done
diff --git a/tests/qemu-iotests/tests/zoned.sh 
b/tests/qemu-iotests/tests/zoned.sh
index fced0194c5..888711eef2 100755
--- a/tests/qemu-iotests/tests/zoned.sh
+++ b/tests/qemu-iotests/tests/zoned.sh
@@ -79,6 +79,15 @@ echo "(5) resetting the second zone"
 sudo $QEMU_IO $IMG -c "zrs 268435456 268435456"
 echo "After resetting a zone:"
 sudo $QEMU_IO $IMG -c "zrp 268435456 1"
+echo
+echo
+echo "(6) append write" # physical block size of the device is 4096
+sudo $QEMU_IO $IMG -c "zap 0 0x1000 0x2000"
+echo "After appending the first zone:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+sudo $QEMU_IO $IMG -c "zap 268435456 0x1000 0x2000"
+echo "After appending the second zone:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1"
 
 # success, all done
 echo "*** done"
-- 
2.37.3




[PATCH v4 1/3] file-posix: add the tracking of the zones write pointers

2022-10-16 Thread Sam Li
Since Linux doesn't have a user API to issue zone append operations to
zoned devices from user space, the file-posix driver is modified to add
zone append emulation using regular writes. To do this, the file-posix
driver tracks the wp location of all zones of the device. It uses an
array of uint64_t. The most significant bit of each wp location indicates
whether the zone is a conventional zone.

The zones wp can be changed due to the following operations issued:
- zone reset: change the wp to the start offset of that zone
- zone finish: change to the end location of that zone
- write to a zone
- zone append

Signed-off-by: Sam Li 
---
 block/file-posix.c   | 144 +++
 include/block/block-common.h |  14 +++
 include/block/block_int-common.h |   3 +
 3 files changed, 161 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index 7c5a330fc1..5ff5500301 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1324,6 +1324,66 @@ static int hdev_get_max_segments(int fd, struct stat *st)
 #endif
 }
 
+#if defined(CONFIG_BLKZONED)
+static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
+unsigned int nrz) {
+struct blk_zone *blkz;
+int64_t rep_size;
+int64_t sector = offset >> BDRV_SECTOR_BITS;
+int ret, n = 0, i = 0;
+rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+g_autofree struct blk_zone_report *rep = NULL;
+
+rep = g_malloc(rep_size);
+blkz = (struct blk_zone *)(rep + 1);
+while (n < nrz) {
+memset(rep, 0, rep_size);
+rep->sector = sector;
+rep->nr_zones = nrz - n;
+
+do {
+ret = ioctl(fd, BLKREPORTZONE, rep);
+} while (ret != 0 && errno == EINTR);
+if (ret != 0) {
+error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+fd, offset, errno);
+return -errno;
+}
+
+if (!rep->nr_zones) {
+break;
+}
+
+for (i = 0; i < rep->nr_zones; i++, n++) {
+/*
+ * The wp tracking cares only about sequential writes required and
+ * sequential write preferred zones so that the wp can advance to
+ * the right location.
+ * Use the most significant bit of the wp location to indicate the
+ * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
+ */
+if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+wps->wp[i] = 1ULL << 63;
+} else {
+wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
+}
+}
+sector = blkz[i - 1].start + blkz[i - 1].len;
+}
+
+return 0;
+}
+
+static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
+unsigned int nrz) {
+qemu_mutex_lock(>lock);
+if (get_zones_wp(fd, wps, offset, nrz) < 0) {
+error_report("update zone wp failed");
+}
+qemu_mutex_unlock(>lock);
+}
+#endif
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
 BDRVRawState *s = bs->opaque;
@@ -1414,6 +1474,14 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 if (ret >= 0) {
 bs->bl.max_active_zones = ret;
 }
+
+bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
+if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
+error_report("report wps failed");
+g_free(bs->bl.wps);
+return;
+}
+qemu_mutex_init(>bl.wps->lock);
 }
 }
 
@@ -1725,6 +1793,25 @@ static int handle_aiocb_rw(void *opaque)
 
 out:
 if (nbytes == aiocb->aio_nbytes) {
+#if defined(CONFIG_BLKZONED)
+if (aiocb->aio_type & QEMU_AIO_WRITE) {
+BlockZoneWps *wps = aiocb->bs->bl.wps;
+int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+if (wps) {
+qemu_mutex_lock(>lock);
+if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+uint64_t wend_offset =
+aiocb->aio_offset + aiocb->aio_nbytes;
+
+/* Advance the wp if needed */
+if (wend_offset > wps->wp[index]) {
+wps->wp[index] = wend_offset;
+}
+}
+qemu_mutex_unlock(>lock);
+}
+}
+#endif
 return 0;
 } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
 if (aiocb->aio_type & QEMU_AIO_WRITE) {
@@ -1736,6 +1823,11 @@ out:
 }
 } else {
 assert(nbytes < 0);
+#if defined(CONFIG_BLKZONED)
+if (aiocb->aio_type & QEMU_AIO_WRITE) {
+update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1);
+}
+#endif
 return nbytes;
 }
 }
@@ -2022,14 +2114,29 @@ static int handle_aiocb_zone_report(void *opaque)
 #endif
 
 #if 

[RFC v3 0/2] Add zoned storage emulation to virtio-blk driver

2022-10-16 Thread Sam Li
Note: the virtio-blk headers aren't upstream in the kernel yet, so this
series is marked as an RFC.

v3:
- use qemuio_from_buffer to write status bit [Stefan]
- avoid using req->elem directly [Stefan]
- fix error checkings and memory leak [Stefan]

v2:
- change units of emulated zone op corresponding to block layer APIs
- modify error checking cases [Stefan, Damien]

v1:
- add zoned storage emulation

Sam Li (2):
  include: update virtio_blk headers from Linux 5.19-rc2+
  virtio-blk: add zoned storage emulation for zoned devices

 hw/block/virtio-blk-common.c|   2 +
 hw/block/virtio-blk.c   | 412 +++-
 include/hw/virtio/virtio-blk.h  |  11 +-
 include/standard-headers/linux/virtio_blk.h | 109 ++
 4 files changed, 531 insertions(+), 3 deletions(-)

-- 
2.37.3




[PATCH v12 6/7] qemu-iotests: test new zone operations

2022-10-16 Thread Sam Li
We have added new block layer APIs for zoned block devices. Test them by
creating a null_blk device, running each zone operation on it, and checking
whether the right zone information is reported.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
---
 tests/qemu-iotests/tests/zoned.out | 53 ++
 tests/qemu-iotests/tests/zoned.sh  | 86 ++
 2 files changed, 139 insertions(+)
 create mode 100644 tests/qemu-iotests/tests/zoned.out
 create mode 100755 tests/qemu-iotests/tests/zoned.sh

diff --git a/tests/qemu-iotests/tests/zoned.out 
b/tests/qemu-iotests/tests/zoned.out
new file mode 100644
index 00..0c8f96deb9
--- /dev/null
+++ b/tests/qemu-iotests/tests/zoned.out
@@ -0,0 +1,53 @@
+QA output created by zoned.sh
+Testing a null_blk device:
+Simple cases: if the operations work
+(1) report the first zone:
+start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
+
+report the first 10 zones
+start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
+start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 2]
+start: 0x10, len 0x8, cap 0x8, wptr 0x10, zcond:1, [type: 2]
+start: 0x18, len 0x8, cap 0x8, wptr 0x18, zcond:1, [type: 2]
+start: 0x20, len 0x8, cap 0x8, wptr 0x20, zcond:1, [type: 2]
+start: 0x28, len 0x8, cap 0x8, wptr 0x28, zcond:1, [type: 2]
+start: 0x30, len 0x8, cap 0x8, wptr 0x30, zcond:1, [type: 2]
+start: 0x38, len 0x8, cap 0x8, wptr 0x38, zcond:1, [type: 2]
+start: 0x40, len 0x8, cap 0x8, wptr 0x40, zcond:1, [type: 2]
+start: 0x48, len 0x8, cap 0x8, wptr 0x48, zcond:1, [type: 2]
+
+report the last zone:
+start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:1, [type: 
2]
+
+
+(2) opening the first zone
+report after:
+start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:3, [type: 2]
+
+opening the second zone
+report after:
+start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:3, [type: 2]
+
+opening the last zone
+report after:
+start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:3, [type: 
2]
+
+
+(3) closing the first zone
+report after:
+start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
+
+closing the last zone
+report after:
+start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:1, [type: 
2]
+
+
+(4) finishing the second zone
+After finishing a zone:
+start: 0x8, len 0x8, cap 0x8, wptr 0x10, zcond:14, [type: 2]
+
+
+(5) resetting the second zone
+After resetting a zone:
+start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 2]
+*** done
diff --git a/tests/qemu-iotests/tests/zoned.sh 
b/tests/qemu-iotests/tests/zoned.sh
new file mode 100755
index 00..fced0194c5
--- /dev/null
+++ b/tests/qemu-iotests/tests/zoned.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+#
+# Test zone management operations.
+#
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+status=1 # failure is the default!
+
+_cleanup()
+{
+  _cleanup_test_img
+  sudo rmmod null_blk
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.qemu
+
+# This test only runs on Linux hosts with raw image files.
+_supported_fmt raw
+_supported_proto file
+_supported_os Linux
+
+QEMU_IO="build/qemu-io"
+IMG="--image-opts -n driver=zoned_host_device,filename=/dev/nullb0"
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
+
+echo "Testing a null_blk device:"
+echo "case 1: if the operations work"
+sudo modprobe null_blk nr_devices=1 zoned=1
+
+echo "(1) report the first zone:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "report the first 10 zones"
+sudo $QEMU_IO $IMG -c "zrp 0 10"
+echo
+echo "report the last zone:"
+sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2" # 0x3e7000 / 512 = 0x1f38
+echo
+echo
+echo "(2) opening the first zone"
+sudo $QEMU_IO $IMG -c "zo 0 268435456"  # 268435456 / 512 = 524288
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "opening the second zone"
+sudo $QEMU_IO $IMG -c "zo 268435456 268435456" #
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1"
+echo
+echo "opening the last zone"
+sudo $QEMU_IO $IMG -c "zo 0x3e7000 268435456"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2"
+echo
+echo
+echo "(3) closing the first zone"
+sudo $QEMU_IO $IMG -c "zc 0 268435456"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "closing the last zone"
+sudo $QEMU_IO $IMG -c "zc 0x3e7000 268435456"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2"
+echo
+echo
+echo "(4) finishing the second zone"
+sudo $QEMU_IO $IMG -c "zf 268435456 268435456"
+echo "After finishing a zone:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1"
+echo
+echo
+echo "(5) resetting the second zone"
+sudo $QEMU_IO $IMG -c "zrs 268435456 268435456"
+echo "After resetting a zone:"
+sudo $QEMU_IO 

[PATCH v4 2/3] block: introduce zone append write for zoned devices

2022-10-16 Thread Sam Li
A zone append command is a write operation that specifies the first
logical block of a zone as the write position. When writing to a zoned
block device using zone append, the byte offset of writes is pointing
to the write pointer of that zone. Upon completion the device will
respond with the position the data has been written in the zone.

Signed-off-by: Sam Li 
---
 block/block-backend.c | 65 ++
 block/file-posix.c| 89 +--
 block/io.c| 21 
 block/raw-format.c|  8 +++
 include/block/block-io.h  |  3 ++
 include/block/block_int-common.h  |  5 ++
 include/block/raw-aio.h   |  4 +-
 include/sysemu/block-backend-io.h |  9 
 8 files changed, 198 insertions(+), 6 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 1c618e9c68..06931ddd24 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
 struct {
 unsigned long op;
 } zone_mgmt;
+struct {
+int64_t *append_sector;
+} zone_append;
 };
 } BlkRwCo;
 
@@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
BlockZoneOp op,
 return >common;
 }
 
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
+{
+BlkAioEmAIOCB *acb = opaque;
+BlkRwCo *rwco = >rwco;
+
+rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
+   rwco->iobuf, rwco->flags);
+blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+QEMUIOVector *qiov, BdrvRequestFlags flags,
+BlockCompletionFunc *cb, void *opaque) {
+BlkAioEmAIOCB *acb;
+Coroutine *co;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+acb = blk_aio_get(_aio_em_aiocb_info, blk, cb, opaque);
+acb->rwco = (BlkRwCo) {
+.blk= blk,
+.ret= NOT_DONE,
+.flags  = flags,
+.iobuf  = qiov,
+.zone_append = {
+.append_sector = offset,
+},
+};
+acb->has_returned = false;
+
+co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+bdrv_coroutine_enter(blk_bs(blk), co);
+acb->has_returned = true;
+if (acb->rwco.ret != NOT_DONE) {
+replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+}
+
+return >common;
+}
+
 /*
  * Send a zone_report command.
  * offset is a byte offset from the start of the device. No alignment
@@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
BlockZoneOp op,
 return ret;
 }
 
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+int ret;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+blk_wait_while_drained(blk);
+if (!blk_is_available(blk)) {
+blk_dec_in_flight(blk);
+return -ENOMEDIUM;
+}
+
+ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+blk_dec_in_flight(blk);
+return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
 BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 5ff5500301..3d0cc33d02 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
 struct {
 struct iovec *iov;
 int niov;
+int64_t *offset;
 } io;
 struct {
 uint64_t cmd;
@@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 bs->bl.max_active_zones = ret;
 }
 
+ret = get_sysfs_long_val(, "physical_block_size");
+if (ret >= 0) {
+bs->bl.write_granularity = ret;
+}
+
 bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
 if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
 error_report("report wps failed");
@@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int 
nr_iov, off_t offset)
 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
 {
 ssize_t len;
+BlockZoneWps *wps = aiocb->bs->bl.wps;
+int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+
+if (wps) {
+qemu_mutex_lock(>lock);
+if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
+aiocb->aio_offset = wps->wp[index];
+}
+}
 
 do {
-if (aiocb->aio_type & QEMU_AIO_WRITE)
+if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
 len = qemu_pwritev(aiocb->aio_fildes,
aiocb->io.iov,
aiocb->io.niov,
@@ -1660,6 +1675,9 

[PATCH v4 0/3] Add zone append write for zoned device

2022-10-16 Thread Sam Li
v4:
- fix lock related issues[Damien]
- drop all field in zone_mgmt op [Damien]
- fix state checks in zone_mgmt command [Damien]
- return start sector of wp when issuing zap req [Damien]

v3:
- only read wps when it is locked [Damien]
- allow last smaller zone case [Damien]
- add zone type and state checks in zone_mgmt command [Damien]
- fix RESET_ALL related problems

v2:
- split patch to two patches for better reviewing
- change BlockZoneWps's structure to an array of integers
- use only mutex lock on locking conditions of zone wps
- coding styles and clean-ups

v1:
- introduce zone append write

Sam Li (3):
  file-posix: add the tracking of the zones write pointers
  block: introduce zone append write for zoned devices
  qemu-iotests: test zone append operation

 block/block-backend.c  |  65 
 block/file-posix.c | 229 -
 block/io.c |  21 +++
 block/raw-format.c |   8 +
 include/block/block-common.h   |  14 ++
 include/block/block-io.h   |   3 +
 include/block/block_int-common.h   |   8 +
 include/block/raw-aio.h|   4 +-
 include/sysemu/block-backend-io.h  |   9 ++
 qemu-io-cmds.c |  63 
 tests/qemu-iotests/tests/zoned.out |   7 +
 tests/qemu-iotests/tests/zoned.sh  |   9 ++
 12 files changed, 436 insertions(+), 4 deletions(-)

-- 
2.37.3




[PATCH v12 7/7] docs/zoned-storage: add zoned device documentation

2022-10-16 Thread Sam Li
Add the documentation about the zoned device support to virtio-blk
emulation.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Damien Le Moal 
---
 docs/devel/zoned-storage.rst   | 43 ++
 docs/system/qemu-block-drivers.rst.inc |  6 
 2 files changed, 49 insertions(+)
 create mode 100644 docs/devel/zoned-storage.rst

diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
new file mode 100644
index 00..cf169d029b
--- /dev/null
+++ b/docs/devel/zoned-storage.rst
@@ -0,0 +1,43 @@
+=============
+zoned-storage
+=============
+
+Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
+that are larger than the LBA size. They can only allow sequential writes, which
+can reduce write amplification in SSDs, and potentially lead to higher
+throughput and increased capacity. More details about ZBDs can be found at:
+
+https://zonedstorage.io/docs/introduction/zoned-storage
+
+1. Block layer APIs for zoned storage
+-------------------------------------
+QEMU block layer has three zoned storage models:
+- BLK_Z_HM: The host-managed zoned model only allows sequential writes access
+to zones. It supports ZBD-specific I/O commands that can be used by a host to
+manage the zones of a device.
+- BLK_Z_HA: The host-aware zoned model allows random write operations in
+zones, making it backward compatible with regular block devices.
+- BLK_Z_NONE: The non-zoned model has no zones support. It includes both
+regular and drive-managed ZBD devices. ZBD-specific I/O commands are not
+supported.
+
+The block device information resides inside BlockDriverState. QEMU uses
+BlockLimits struct(BlockDriverState::bl) that is continuously accessed by the
+block layer while processing I/O requests. A BlockBackend has a root pointer to
+a BlockDriverState graph(for example, raw format on top of file-posix). The
+zoned storage information can be propagated from the leaf BlockDriverState all
+the way up to the BlockBackend. If the zoned storage model in file-posix is
+set to BLK_Z_HM, then block drivers will declare support for zoned host device.
+
+The block layer APIs support commands needed for zoned storage devices,
+including report zones, four zone operations, and zone append.
+
+2. Emulating zoned storage controllers
+--------------------------------------
+When the BlockBackend's BlockLimits model reports a zoned storage device, users
+like the virtio-blk emulation or the qemu-io-cmds.c utility can use block layer
+APIs for zoned storage emulation or testing.
+
+For example, to test zone_report on a null_blk device using qemu-io is:
+$ path/to/qemu-io --image-opts -n driver=zoned_host_device,filename=/dev/nullb0
+-c "zrp offset nr_zones"
diff --git a/docs/system/qemu-block-drivers.rst.inc 
b/docs/system/qemu-block-drivers.rst.inc
index dfe5d2293d..0b97227fd9 100644
--- a/docs/system/qemu-block-drivers.rst.inc
+++ b/docs/system/qemu-block-drivers.rst.inc
@@ -430,6 +430,12 @@ Hard disks
   you may corrupt your host data (use the ``-snapshot`` command
   line option or modify the device permissions accordingly).
 
+Zoned block devices
+  Zoned block devices can be passed through to the guest if the emulated 
storage
+  controller supports zoned storage. Use ``--blockdev zoned_host_device,
+  node-name=drive0,filename=/dev/nullb0`` to pass through ``/dev/nullb0``
+  as ``drive0``.
+
 Windows
 ^^^
 
-- 
2.37.3




[PATCH v12 4/7] raw-format: add zone operations to pass through requests

2022-10-16 Thread Sam Li
raw-format driver usually sits on top of file-posix driver. It needs to
pass through requests of zone commands.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Damien Le Moal 
Reviewed-by: Hannes Reinecke 
---
 block/raw-format.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/block/raw-format.c b/block/raw-format.c
index f337ac7569..bac43f1d25 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -314,6 +314,17 @@ static int coroutine_fn raw_co_pdiscard(BlockDriverState 
*bs,
 return bdrv_co_pdiscard(bs->file, offset, bytes);
 }
 
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t 
offset,
+   unsigned int *nr_zones,
+   BlockZoneDescriptor *zones) {
+return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
+}
+
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+ int64_t offset, int64_t len) {
+return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
+}
+
 static int64_t raw_getlength(BlockDriverState *bs)
 {
 int64_t len;
@@ -615,6 +626,8 @@ BlockDriver bdrv_raw = {
 .bdrv_co_pwritev  = _co_pwritev,
 .bdrv_co_pwrite_zeroes = _co_pwrite_zeroes,
 .bdrv_co_pdiscard = _co_pdiscard,
+.bdrv_co_zone_report  = _co_zone_report,
+.bdrv_co_zone_mgmt  = _co_zone_mgmt,
 .bdrv_co_block_status = _co_block_status,
 .bdrv_co_copy_range_from = _co_copy_range_from,
 .bdrv_co_copy_range_to  = _co_copy_range_to,
-- 
2.37.3




[PATCH v12 5/7] config: add check to block layer

2022-10-16 Thread Sam Li
Putting zoned/non-zoned BlockDrivers on top of each other is not
allowed.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Hannes Reinecke 
---
 block.c  | 19 +++
 block/file-posix.c   | 12 
 block/raw-format.c   |  1 +
 include/block/block_int-common.h |  5 +
 4 files changed, 37 insertions(+)

diff --git a/block.c b/block.c
index 1fbf6b9e69..5d6fa4a25a 100644
--- a/block.c
+++ b/block.c
@@ -7951,6 +7951,25 @@ void bdrv_add_child(BlockDriverState *parent_bs, 
BlockDriverState *child_bs,
 return;
 }
 
+/*
+ * Non-zoned block drivers do not follow zoned storage constraints
+ * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
+ * drivers in a graph.
+ */
+if (!parent_bs->drv->supports_zoned_children &&
+child_bs->bl.zoned == BLK_Z_HM) {
+/*
+ * The host-aware model allows zoned storage constraints and random
+ * write. Allow mixing host-aware and non-zoned drivers. Using
+ * host-aware device as a regular device.
+ */
+error_setg(errp, "Cannot add a %s child to a %s parent",
+   child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
+   parent_bs->drv->supports_zoned_children ?
+   "support zoned children" : "not support zoned children");
+return;
+}
+
 if (!QLIST_EMPTY(_bs->parents)) {
 error_setg(errp, "The node %s already has a parent",
child_bs->node_name);
diff --git a/block/file-posix.c b/block/file-posix.c
index bd28e3eaea..7c5a330fc1 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -776,6 +776,18 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 goto fail;
 }
 }
+#ifdef CONFIG_BLKZONED
+/*
+ * The kernel page cache does not reliably work for writes to SWR zones
+ * of zoned block device because it can not guarantee the order of writes.
+ */
+if ((strcmp(bs->drv->format_name, "zoned_host_device") == 0) &&
+(!(s->open_flags & O_DIRECT))) {
+error_setg(errp, "driver=zoned_host_device was specified, but it "
+   "requires cache.direct=on, which was not specified.");
+return -EINVAL; /* No host kernel page cache */
+}
+#endif
 
 if (S_ISBLK(st.st_mode)) {
 #ifdef __linux__
diff --git a/block/raw-format.c b/block/raw-format.c
index bac43f1d25..18dc52a150 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -615,6 +615,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild 
*c,
 BlockDriver bdrv_raw = {
 .format_name  = "raw",
 .instance_size= sizeof(BDRVRawState),
+.supports_zoned_children = true,
 .bdrv_probe   = _probe,
 .bdrv_reopen_prepare  = _reopen_prepare,
 .bdrv_reopen_commit   = _reopen_commit,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index cdc06e77a6..37dddc603c 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -127,6 +127,11 @@ struct BlockDriver {
  */
 bool is_format;
 
+/*
+ * Set to true if the BlockDriver supports zoned children.
+ */
+bool supports_zoned_children;
+
 /*
  * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
  * this field set to true, except ones that are defined only by their
-- 
2.37.3




[PATCH v12 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-10-16 Thread Sam Li
Add a new zoned_host_device BlockDriver. The zoned_host_device option
accepts only zoned host block devices. By adding zone management
operations in this new BlockDriver, users can use the new block
layer APIs including Report Zone and four zone management operations
(open, close, finish, reset).

Qemu-io uses the new APIs to perform zoned storage commands of the device:
zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
zone_finish(zf).

For example, to test zone_report, use following command:
$ ./build/qemu-io --image-opts -n driver=zoned_host_device,filename=/dev/nullb0
-c "zrp offset nr_zones"

Signed-off-by: Sam Li 
Reviewed-by: Hannes Reinecke 
---
 block/block-backend.c | 148 +
 block/file-posix.c| 335 ++
 block/io.c|  41 
 include/block/block-io.h  |   7 +
 include/block/block_int-common.h  |  24 +++
 include/block/raw-aio.h   |   6 +-
 include/sysemu/block-backend-io.h |  18 ++
 meson.build   |   4 +
 qapi/block-core.json  |   8 +-
 qemu-io-cmds.c| 149 +
 10 files changed, 737 insertions(+), 3 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index aa4adf06ae..1c618e9c68 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
 void *iobuf;
 int ret;
 BdrvRequestFlags flags;
+union {
+struct {
+unsigned int *nr_zones;
+BlockZoneDescriptor *zones;
+} zone_report;
+struct {
+unsigned long op;
+} zone_mgmt;
+};
 } BlkRwCo;
 
 int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
@@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
 return ret;
 }
 
+static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
+{
+BlkAioEmAIOCB *acb = opaque;
+BlkRwCo *rwco = >rwco;
+
+rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
+   rwco->zone_report.nr_zones,
+   rwco->zone_report.zones);
+blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
+unsigned int *nr_zones,
+BlockZoneDescriptor  *zones,
+BlockCompletionFunc *cb, void *opaque)
+{
+BlkAioEmAIOCB *acb;
+Coroutine *co;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+acb = blk_aio_get(_aio_em_aiocb_info, blk, cb, opaque);
+acb->rwco = (BlkRwCo) {
+.blk= blk,
+.offset = offset,
+.ret= NOT_DONE,
+.zone_report = {
+.zones = zones,
+.nr_zones = nr_zones,
+},
+};
+acb->has_returned = false;
+
+co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
+bdrv_coroutine_enter(blk_bs(blk), co);
+
+acb->has_returned = true;
+if (acb->rwco.ret != NOT_DONE) {
+replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+}
+
+return >common;
+}
+
+static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
+{
+BlkAioEmAIOCB *acb = opaque;
+BlkRwCo *rwco = >rwco;
+
+rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
+ rwco->offset, acb->bytes);
+blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+  int64_t offset, int64_t len,
+  BlockCompletionFunc *cb, void *opaque) {
+BlkAioEmAIOCB *acb;
+Coroutine *co;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+acb = blk_aio_get(_aio_em_aiocb_info, blk, cb, opaque);
+acb->rwco = (BlkRwCo) {
+.blk= blk,
+.offset = offset,
+.ret= NOT_DONE,
+.zone_mgmt = {
+.op = op,
+},
+};
+acb->bytes = len;
+acb->has_returned = false;
+
+co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
+bdrv_coroutine_enter(blk_bs(blk), co);
+
+acb->has_returned = true;
+if (acb->rwco.ret != NOT_DONE) {
+replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+}
+
+return >common;
+}
+
+/*
+ * Send a zone_report command.
+ * offset is a byte offset from the start of the device. No alignment
+ * required for offset.
+ * nr_zones represents IN maximum and OUT actual.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+unsigned int *nr_zones,
+BlockZoneDescriptor *zones)
+{
+int ret;
+IO_CODE();
+
+blk_inc_in_flight(blk); /* increase before waiting */
+blk_wait_while_drained(blk);
+if 

[PATCH v12 2/7] file-posix: introduce helper functions for sysfs attributes

2022-10-16 Thread Sam Li
Use get_sysfs_str_val() to get the string value of device
zoned model. Then get_sysfs_zoned_model() can convert it to
BlockZoneModel type of QEMU.

Use get_sysfs_long_val() to get the long value of zoned device
information.

Signed-off-by: Sam Li 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Damien Le Moal 
---
 block/file-posix.c   | 124 ++-
 include/block/block_int-common.h |   3 +
 2 files changed, 91 insertions(+), 36 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 23acffb9a4..8cb07fdb8a 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1201,66 +1201,112 @@ static int hdev_get_max_hw_transfer(int fd, struct 
stat *st)
 #endif
 }
 
-static int hdev_get_max_segments(int fd, struct stat *st)
-{
+/*
+ * Get a sysfs attribute value as character string.
+ */
+static int get_sysfs_str_val(struct stat *st, const char *attribute,
+ char **val) {
 #ifdef CONFIG_LINUX
-char buf[32];
-const char *end;
-char *sysfspath = NULL;
+g_autofree char *sysfspath = NULL;
 int ret;
-int sysfd = -1;
-long max_segments;
+size_t len;
 
-if (S_ISCHR(st->st_mode)) {
-if (ioctl(fd, SG_GET_SG_TABLESIZE, ) == 0) {
-return ret;
-}
+if (!S_ISBLK(st->st_mode)) {
 return -ENOTSUP;
 }
 
-if (!S_ISBLK(st->st_mode)) {
-return -ENOTSUP;
+sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
+major(st->st_rdev), minor(st->st_rdev),
+attribute);
+ret = g_file_get_contents(sysfspath, val, , NULL);
+if (ret == -1) {
+return -ENOENT;
 }
 
-sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-major(st->st_rdev), minor(st->st_rdev));
-sysfd = open(sysfspath, O_RDONLY);
-if (sysfd == -1) {
-ret = -errno;
-goto out;
+/* The file is ended with '\n' */
+char *p;
+p = *val;
+if (*(p + len - 1) == '\n') {
+*(p + len - 1) = '\0';
 }
-do {
-ret = read(sysfd, buf, sizeof(buf) - 1);
-} while (ret == -1 && errno == EINTR);
+return ret;
+#else
+return -ENOTSUP;
+#endif
+}
+
+static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
+{
+g_autofree char *val = NULL;
+int ret;
+
+ret = get_sysfs_str_val(st, "zoned", );
 if (ret < 0) {
-ret = -errno;
-goto out;
-} else if (ret == 0) {
-ret = -EIO;
-goto out;
+return ret;
 }
-buf[ret] = 0;
-/* The file is ended with '\n', pass 'end' to accept that. */
-ret = qemu_strtol(buf, , 10, _segments);
-if (ret == 0 && end && *end == '\n') {
-ret = max_segments;
+
+if (strcmp(val, "host-managed") == 0) {
+*zoned = BLK_Z_HM;
+} else if (strcmp(val, "host-aware") == 0) {
+*zoned = BLK_Z_HA;
+} else if (strcmp(val, "none") == 0) {
+*zoned = BLK_Z_NONE;
+} else {
+return -ENOTSUP;
 }
+return 0;
+}
 
-out:
-if (sysfd != -1) {
-close(sysfd);
+/*
+ * Get a sysfs attribute value as a long integer.
+ */
+static long get_sysfs_long_val(struct stat *st, const char *attribute)
+{
+#ifdef CONFIG_LINUX
+g_autofree char *str = NULL;
+const char *end;
+long val;
+int ret;
+
+ret = get_sysfs_str_val(st, attribute, );
+if (ret < 0) {
+return ret;
+}
+
+/* The file is ended with '\n', pass 'end' to accept that. */
+ret = qemu_strtol(str, , 10, );
+if (ret == 0 && end && *end == '\0') {
+ret = val;
 }
-g_free(sysfspath);
 return ret;
 #else
 return -ENOTSUP;
 #endif
 }
 
+static int hdev_get_max_segments(int fd, struct stat *st)
+{
+#ifdef CONFIG_LINUX
+int ret;
+
+if (S_ISCHR(st->st_mode)) {
+if (ioctl(fd, SG_GET_SG_TABLESIZE, ) == 0) {
+return ret;
+}
+return -ENOTSUP;
+}
+return get_sysfs_long_val(st, "max_segments");
+#else
+return -ENOTSUP;
+#endif
+}
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
 BDRVRawState *s = bs->opaque;
 struct stat st;
+int ret;
+BlockZoneModel zoned;
 
 s->needs_alignment = raw_needs_alignment(bs);
 raw_probe_alignment(bs, s->fd, errp);
@@ -1298,6 +1344,12 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 bs->bl.max_hw_iov = ret;
 }
 }
+
+ret = get_sysfs_zoned_model(, );
+if (ret < 0) {
+zoned = BLK_Z_NONE;
+}
+bs->bl.zoned = zoned;
 }
 
 static int check_for_dasd(int fd)
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 8947abab76..7f7863cc9e 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -825,6 +825,9 @@ typedef struct BlockLimits {
 
 /* maximum number of iovec elements */
   

[PATCH v12 1/7] include: add zoned device structs

2022-10-16 Thread Sam Li
Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Damien Le Moal 
Reviewed-by: Hannes Reinecke 
---
 include/block/block-common.h | 43 
 1 file changed, 43 insertions(+)

diff --git a/include/block/block-common.h b/include/block/block-common.h
index fdb7306e78..36bd0e480e 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -49,6 +49,49 @@ typedef struct BlockDriver BlockDriver;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildClass BdrvChildClass;
 
+typedef enum BlockZoneOp {
+BLK_ZO_OPEN,
+BLK_ZO_CLOSE,
+BLK_ZO_FINISH,
+BLK_ZO_RESET,
+} BlockZoneOp;
+
+typedef enum BlockZoneModel {
+BLK_Z_NONE = 0x0, /* Regular block device */
+BLK_Z_HM = 0x1, /* Host-managed zoned block device */
+BLK_Z_HA = 0x2, /* Host-aware zoned block device */
+} BlockZoneModel;
+
+typedef enum BlockZoneCondition {
+BLK_ZS_NOT_WP = 0x0,
+BLK_ZS_EMPTY = 0x1,
+BLK_ZS_IOPEN = 0x2,
+BLK_ZS_EOPEN = 0x3,
+BLK_ZS_CLOSED = 0x4,
+BLK_ZS_RDONLY = 0xD,
+BLK_ZS_FULL = 0xE,
+BLK_ZS_OFFLINE = 0xF,
+} BlockZoneCondition;
+
+typedef enum BlockZoneType {
+BLK_ZT_CONV = 0x1, /* Conventional random writes supported */
+BLK_ZT_SWR = 0x2, /* Sequential writes required */
+BLK_ZT_SWP = 0x3, /* Sequential writes preferred */
+} BlockZoneType;
+
+/*
+ * Zone descriptor data structure.
+ * Provides information on a zone with all position and size values in bytes.
+ */
+typedef struct BlockZoneDescriptor {
+uint64_t start;
+uint64_t length;
+uint64_t cap;
+uint64_t wp;
+BlockZoneType type;
+BlockZoneCondition cond;
+} BlockZoneDescriptor;
+
 typedef struct BlockDriverInfo {
 /* in bytes, 0 if irrelevant */
 int cluster_size;
-- 
2.37.3




[PATCH v12 0/7] Add support for zoned device

2022-10-16 Thread Sam Li
Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
that are larger than the LBA size. They only allow sequential writes, which
reduces write amplification in SSD, leading to higher throughput and increased
capacity. More details about ZBDs can be found at:

https://zonedstorage.io/docs/introduction/zoned-storage

The zoned device support aims to let guests (virtual machines) access zoned
storage devices on the host (hypervisor) through a virtio-blk device. This
involves extending QEMU's block layer and virtio-blk emulation code.  In its
current status, the virtio-blk device is not aware of ZBDs but the guest sees
host-managed drives as regular drives that will run correctly under the most
common write workloads.

This patch series extends the block layer APIs with the minimum set of zoned
commands that are necessary to support zoned devices. The commands are - Report
Zones, four zone operations and Zone Append (developing).

It can be tested on a null_blk device using qemu-io or qemu-iotests. For
example, to test zone report using qemu-io:
$ path/to/qemu-io --image-opts -n driver=zoned_host_device,filename=/dev/nullb0
-c "zrp offset nr_zones"

v12:
- address review comments
  * drop BLK_ZO_RESET_ALL bit [Damien]
  * fix error messages, style, and typos[Damien, Hannes]

v11:
- address review comments
  * fix possible BLKZONED config compiling warnings [Stefan]
  * fix capacity field compiling warnings on older kernel [Stefan,Damien]

v10:
- address review comments
  * deal with the last small zone case in zone_mgmt operations [Damien]
  * handle the capacity field outdated in old kernel(before 5.9) [Damien]
  * use byte unit in block layer to be consistent with QEMU [Eric]
  * fix coding style related problems [Stefan]

v9:
- address review comments
  * specify units of zone commands requests [Stefan]
  * fix some error handling in file-posix [Stefan]
  * introduce zoned_host_device in the commit message [Markus]

v8:
- address review comments
  * solve patch conflicts and merge sysfs helper functions into one patch
  * add cache.direct=on check in config

v7:
- address review comments
  * modify sysfs attribute helper functions
  * move the input validation and error checking into raw_co_zone_* function
  * fix checks in config

v6:
- drop virtio-blk emulation changes
- address Stefan's review comments
  * fix CONFIG_BLKZONED configs in related functions
  * replace reading fd by g_file_get_contents() in get_sysfs_str_val()
  * rewrite documentation for zoned storage

v5:
- add zoned storage emulation to virtio-blk device
- add documentation for zoned storage
- address review comments
  * fix qemu-iotests
  * fix check to block layer
  * modify interfaces of sysfs helper functions
  * rename zoned device structs according to QEMU styles
  * reorder patches

v4:
- add virtio-blk headers for zoned device
- add configurations for zoned host device
- add zone operations for raw-format
- address review comments
  * fix memory leak bug in zone_report
  * add checks to block layers
  * fix qemu-iotests format
  * fix sysfs helper functions

v3:
- add helper functions to get sysfs attributes
- address review comments
  * fix zone report bugs
  * fix the qemu-io code path
  * use thread pool to avoid blocking ioctl() calls

v2:
- add qemu-io sub-commands
- address review comments
  * modify interfaces of APIs

v1:
- add block layer APIs resembling Linux ZoneBlockDevice ioctls

Sam Li (7):
  include: add zoned device structs
  file-posix: introduce helper functions for sysfs attributes
  block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  raw-format: add zone operations to pass through requests
  config: add check to block layer
  qemu-iotests: test new zone operations
  docs/zoned-storage: add zoned device documentation

 block.c|  19 +
 block/block-backend.c  | 148 
 block/file-posix.c | 471 +++--
 block/io.c |  41 +++
 block/raw-format.c |  14 +
 docs/devel/zoned-storage.rst   |  43 +++
 docs/system/qemu-block-drivers.rst.inc |   6 +
 include/block/block-common.h   |  43 +++
 include/block/block-io.h   |   7 +
 include/block/block_int-common.h   |  32 ++
 include/block/raw-aio.h|   6 +-
 include/sysemu/block-backend-io.h  |  18 +
 meson.build|   4 +
 qapi/block-core.json   |   8 +-
 qemu-io-cmds.c | 149 
 tests/qemu-iotests/tests/zoned.out |  53 +++
 tests/qemu-iotests/tests/zoned.sh  |  86 +
 17 files changed, 1109 insertions(+), 39 deletions(-)
 create mode 100644 docs/devel/zoned-storage.rst
 create mode 100644 tests/qemu-iotests/tests/zoned.out
 create mode 100755 tests/qemu-iotests/tests/zoned.sh

-- 
2.37.3




Re: [PATCH v3 7/9] hw/ppc/e500: Implement pflash handling

2022-10-16 Thread BALATON Zoltan

On Sun, 16 Oct 2022, Bernhard Beschow wrote:

Allows e500 boards to have their root file system reside on flash using
only builtin devices located in the eLBC memory region.

Note that the flash memory area is only created when a -pflash argument is
given, and that the size is determined by the given file. The idea is to
put users into control.

Signed-off-by: Bernhard Beschow 
---
docs/system/ppc/ppce500.rst | 16 ++
hw/ppc/Kconfig  |  1 +
hw/ppc/e500.c   | 62 +
3 files changed, 79 insertions(+)

diff --git a/docs/system/ppc/ppce500.rst b/docs/system/ppc/ppce500.rst
index ba6bcb7314..99d2c680d6 100644
--- a/docs/system/ppc/ppce500.rst
+++ b/docs/system/ppc/ppce500.rst
@@ -165,3 +165,19 @@ if “-device eTSEC” is given to QEMU:
.. code-block:: bash

  -netdev tap,ifname=tap0,script=no,downscript=no,id=net0 -device 
eTSEC,netdev=net0
+
+Root file system on flash drive
+---
+
+Rather than using a root file system on ram disk, it is possible to have it on
+CFI flash. Given an ext2 image whose size must be a power of two, it can be 
used
+as follows:
+
+.. code-block:: bash
+
+  $ qemu-system-ppc{64|32} -M ppce500 -cpu e500mc -smp 4 -m 2G \


We have qemu-system-ppc and qemu-system-ppc64 not qemu-system-ppc32 so 
maybe qemu-system-ppc[64] even though that looks odd so maybe just 
qemu-system-ppc and then people should know that ppc64 includes ppc config 
as well.


Regards,
BALATON Zoltan


+  -display none -serial stdio \
+  -kernel vmlinux \
+  -drive if=pflash,file=/path/to/rootfs.ext2,format=raw \
+  -append "rootwait root=/dev/mtdblock0"
+
diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig
index 791fe78a50..769a1ead1c 100644
--- a/hw/ppc/Kconfig
+++ b/hw/ppc/Kconfig
@@ -126,6 +126,7 @@ config E500
select ETSEC
select GPIO_MPC8XXX
select OPENPIC
+select PFLASH_CFI01
select PLATFORM_BUS
select PPCE500_PCI
select SERIAL
diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index 3e950ea3ba..23d2c3451a 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -23,8 +23,10 @@
#include "e500-ccsr.h"
#include "net/net.h"
#include "qemu/config-file.h"
+#include "hw/block/flash.h"
#include "hw/char/serial.h"
#include "hw/pci/pci.h"
+#include "sysemu/block-backend-io.h"
#include "sysemu/sysemu.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
@@ -267,6 +269,31 @@ static void sysbus_device_create_devtree(SysBusDevice 
*sbdev, void *opaque)
}
}

+static void create_devtree_flash(SysBusDevice *sbdev,
+ PlatformDevtreeData *data)
+{
+g_autofree char *name = NULL;
+uint64_t num_blocks = object_property_get_uint(OBJECT(sbdev),
+   "num-blocks",
+   _fatal);
+uint64_t sector_length = object_property_get_uint(OBJECT(sbdev),
+  "sector-length",
+  _fatal);
+uint64_t bank_width = object_property_get_uint(OBJECT(sbdev),
+   "width",
+   _fatal);
+hwaddr flashbase = 0;
+hwaddr flashsize = num_blocks * sector_length;
+void *fdt = data->fdt;
+
+name = g_strdup_printf("%s/nor@%" PRIx64, data->node, flashbase);
+qemu_fdt_add_subnode(fdt, name);
+qemu_fdt_setprop_string(fdt, name, "compatible", "cfi-flash");
+qemu_fdt_setprop_sized_cells(fdt, name, "reg",
+ 1, flashbase, 1, flashsize);
+qemu_fdt_setprop_cell(fdt, name, "bank-width", bank_width);
+}
+
static void platform_bus_create_devtree(PPCE500MachineState *pms,
void *fdt, const char *mpic)
{
@@ -276,6 +303,8 @@ static void platform_bus_create_devtree(PPCE500MachineState 
*pms,
uint64_t addr = pmc->platform_bus_base;
uint64_t size = pmc->platform_bus_size;
int irq_start = pmc->platform_bus_first_irq;
+SysBusDevice *sbdev;
+bool ambiguous;

/* Create a /platform node that we can put all devices into */

@@ -302,6 +331,13 @@ static void 
platform_bus_create_devtree(PPCE500MachineState *pms,
/* Loop through all dynamic sysbus devices and create nodes for them */
foreach_dynamic_sysbus_device(sysbus_device_create_devtree, );

+sbdev = SYS_BUS_DEVICE(object_resolve_path_type("", TYPE_PFLASH_CFI01,
+));
+if (sbdev) {
+assert(!ambiguous);
+create_devtree_flash(sbdev, );
+}
+
g_free(node);
}

@@ -856,6 +892,7 @@ void ppce500_init(MachineState *machine)
unsigned int pci_irq_nrs[PCI_NUM_PINS] = {1, 2, 3, 4};
IrqLines *irqs;
DeviceState *dev, *mpicdev;
+DriveInfo *dinfo;
CPUPPCState *firstenv = NULL;
MemoryRegion *ccsr_addr_space;
SysBusDevice *s;
@@ -1024,6 +1061,31 @@ void 

Re: [PATCH v6 2/2] block: Refactor get_tmp_filename()

2022-10-16 Thread Bin Meng
On Mon, Oct 10, 2022 at 12:05 PM Bin Meng  wrote:
>
> At present there are two callers of get_tmp_filename() and they are
> inconsistent.
>
> One does:
>
> /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
> char *tmp_filename = g_malloc0(PATH_MAX + 1);
> ...
> ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
>
> while the other does:
>
> s->qcow_filename = g_malloc(PATH_MAX);
> ret = get_tmp_filename(s->qcow_filename, PATH_MAX);
>
> As we can see different 'size' arguments are passed. There are also
> platform specific implementations inside the function, and the use
> of snprintf is really undesirable.
>
> The function name is also misleading. It creates a temporary file,
> not just a filename.
>
> Refactor this routine by changing its name and signature to:
>
> char *create_tmp_file(Error **errp)
>
> and use g_get_tmp_dir() / g_mkstemp() for a consistent implementation.
>
> While we are here, add some comments to mention that /var/tmp is
> preferred over /tmp on non-win32 hosts.
>
> Signed-off-by: Bin Meng 
> ---
>
> Changes in v6:
> - use g_mkstemp() and stick to use /var/tmp for non-win32 hosts
>
> Changes in v5:
> - minor change in the commit message
> - add some notes in the function comment block
> - add g_autofree for tmp_filename
>
> Changes in v4:
> - Rename the function to create_tmp_file() and take "Error **errp" as
>   a parameter, so that callers can pass errp all the way down to this
>   routine.
> - Commit message updated to reflect the latest change
>
> Changes in v3:
> - Do not use errno directly, instead still let get_tmp_filename() return
>   a negative number to indicate error
>
> Changes in v2:
> - Use g_autofree and g_steal_pointer
>
>  include/block/block_int-common.h |  2 +-
>  block.c  | 56 +---
>  block/vvfat.c|  7 ++--
>  3 files changed, 34 insertions(+), 31 deletions(-)
>

Any comments?



[PATCH v3 9/9] hw/ppc/e500: Add Freescale eSDHC to e500plat

2022-10-16 Thread Bernhard Beschow
Adds missing functionality to e500plat machine which increases the
chance of given "real" firmware images to access SD cards.

Signed-off-by: Bernhard Beschow 
---
 docs/system/ppc/ppce500.rst | 12 
 hw/ppc/Kconfig  |  1 +
 hw/ppc/e500.c   | 35 ++-
 hw/ppc/e500.h   |  1 +
 hw/ppc/e500plat.c   |  1 +
 5 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/docs/system/ppc/ppce500.rst b/docs/system/ppc/ppce500.rst
index 99d2c680d6..298ee9ee16 100644
--- a/docs/system/ppc/ppce500.rst
+++ b/docs/system/ppc/ppce500.rst
@@ -19,6 +19,7 @@ The ``ppce500`` machine supports the following devices:
 * Power-off functionality via one GPIO pin
 * 1 Freescale MPC8xxx PCI host controller
 * VirtIO devices via PCI bus
+* 1 Freescale Enhanced Secure Digital Host controller (eSDHC)
 * 1 Freescale Enhanced Triple Speed Ethernet controller (eTSEC)
 
 Hardware configuration information
@@ -181,3 +182,14 @@ as follows:
   -drive if=pflash,file=/path/to/rootfs.ext2,format=raw \
   -append "rootwait root=/dev/mtdblock0"
 
+Alternatively, the root file system can also reside on an emulated SD card
+whose size must again be a power of two:
+
+.. code-block:: bash
+
+  $ qemu-system-ppc{64|32} -M ppce500 -cpu e500mc -smp 4 -m 2G \
+  -display none -serial stdio \
+  -kernel vmlinux \
+  -device sd-card,drive=mydrive \
+  -drive id=mydrive,if=none,file=/path/to/rootfs.ext2,format=raw \
+  -append "rootwait root=/dev/mmcblk0"
diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig
index 769a1ead1c..6e31f568ba 100644
--- a/hw/ppc/Kconfig
+++ b/hw/ppc/Kconfig
@@ -129,6 +129,7 @@ config E500
 select PFLASH_CFI01
 select PLATFORM_BUS
 select PPCE500_PCI
+select SDHCI
 select SERIAL
 select MPC_I2C
 select FDT_PPC
diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index 23d2c3451a..f43a21d8bb 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -48,6 +48,7 @@
 #include "hw/net/fsl_etsec/etsec.h"
 #include "hw/i2c/i2c.h"
 #include "hw/irq.h"
+#include "hw/sd/sdhci.h"
 
 #define EPAPR_MAGIC(0x45504150)
 #define DTC_LOAD_PAD   0x180
@@ -66,11 +67,14 @@
 #define MPC8544_SERIAL1_REGS_OFFSET 0x4600ULL
 #define MPC8544_PCI_REGS_OFFSET0x8000ULL
 #define MPC8544_PCI_REGS_SIZE  0x1000ULL
+#define MPC85XX_ESDHC_REGS_OFFSET  0x2e000ULL
+#define MPC85XX_ESDHC_REGS_SIZE0x1000ULL
 #define MPC8544_UTIL_OFFSET0xeULL
 #define MPC8XXX_GPIO_OFFSET0x000FF000ULL
 #define MPC8544_I2C_REGS_OFFSET0x3000ULL
 #define MPC8XXX_GPIO_IRQ   47
 #define MPC8544_I2C_IRQ43
+#define MPC85XX_ESDHC_IRQ  72
 #define RTC_REGS_OFFSET0x68
 
 #define PLATFORM_CLK_FREQ_HZ   (400 * 1000 * 1000)
@@ -203,6 +207,22 @@ static void dt_i2c_create(void *fdt, const char *soc, 
const char *mpic,
 g_free(i2c);
 }
 
+static void dt_sdhc_create(void *fdt, const char *parent, const char *mpic)
+{
+hwaddr mmio = MPC85XX_ESDHC_REGS_OFFSET;
+hwaddr size = MPC85XX_ESDHC_REGS_SIZE;
+int irq = MPC85XX_ESDHC_IRQ;
+g_autofree char *name = NULL;
+
+name = g_strdup_printf("%s/sdhc@%" PRIx64, parent, mmio);
+qemu_fdt_add_subnode(fdt, name);
+qemu_fdt_setprop(fdt, name, "sdhci,auto-cmd12", NULL, 0);
+qemu_fdt_setprop_phandle(fdt, name, "interrupt-parent", mpic);
+qemu_fdt_setprop_cells(fdt, name, "bus-width", 4);
+qemu_fdt_setprop_cells(fdt, name, "interrupts", irq, 0x2);
+qemu_fdt_setprop_cells(fdt, name, "reg", mmio, size);
+qemu_fdt_setprop_string(fdt, name, "compatible", "fsl,esdhc");
+}
 
 typedef struct PlatformDevtreeData {
 void *fdt;
@@ -553,6 +573,10 @@ static int ppce500_load_device_tree(PPCE500MachineState 
*pms,
 
 dt_rtc_create(fdt, "i2c", "rtc");
 
+/* sdhc */
+if (pmc->has_esdhc) {
+dt_sdhc_create(fdt, soc, mpic);
+}
 
 gutil = g_strdup_printf("%s/global-utilities@%llx", soc,
 MPC8544_UTIL_OFFSET);
@@ -982,7 +1006,8 @@ void ppce500_init(MachineState *machine)
0, qdev_get_gpio_in(mpicdev, 42), 399193,
serial_hd(1), DEVICE_BIG_ENDIAN);
 }
-/* I2C */
+
+/* I2C */
 dev = qdev_new("mpc-i2c");
 s = SYS_BUS_DEVICE(dev);
 sysbus_realize_and_unref(s, _fatal);
@@ -992,6 +1017,14 @@ void ppce500_init(MachineState *machine)
 i2c = (I2CBus *)qdev_get_child_bus(dev, "i2c");
 i2c_slave_create_simple(i2c, "ds1338", RTC_REGS_OFFSET);
 
+/* eSDHC */
+if (pmc->has_esdhc) {
+dev = qdev_new(TYPE_FSL_ESDHC);
+s = SYS_BUS_DEVICE(dev);
+sysbus_realize_and_unref(s, _fatal);
+sysbus_mmio_map(s, 0, pmc->ccsrbar_base + MPC85XX_ESDHC_REGS_OFFSET);
+sysbus_connect_irq(s, 0, qdev_get_gpio_in(mpicdev, MPC85XX_ESDHC_IRQ));
+}
 
 /* General Utility device */
 dev = qdev_new("mpc8544-guts");
diff --git a/hw/ppc/e500.h b/hw/ppc/e500.h
index 

Re: [PATCH v3 0/9] ppc/e500: Add support for two types of flash, cleanup

2022-10-16 Thread Bernhard Beschow
Am 16. Oktober 2022 12:27:28 UTC schrieb Bernhard Beschow :
>Cover letter:
>
>~
>
>
>
>This series adds support for -pflash and direct SD card access to the
>
>PPC e500 boards. The idea is to increase compatibility with "real" firmware
>
>images where only the bare minimum of drivers is compiled in.
>
>
>
>The series is structured as follows:
>
>
>
>Patches 1-6 perform some general cleanup which paves the way for the rest of
>
>the series.
>
>
>
>Patch 7 adds -pflash handling where memory-mapped flash can be added on
>
>user's behalf. That is, the flash memory region in the eLBC is only added if
>
>the -pflash argument is supplied. Note that the cfi01 device model becomes
>
>stricter in checking the size of the emulated flash space.
>
>
>
>Patches 8 and 9 add a new device model - the Freescale eSDHC - to the e500
>
>boards which was missing so far.
>
>
>
>User documentation is also added as the new features become available.
>
>
>
>Testing done:
>
>* `qemu-system-ppc -M ppce500 -cpu e500mc -m 256 -kernel uImage -append
>
>"console=ttyS0 rootwait root=/dev/mtdblock0 nokaslr" -drive
>
>if=pflash,file=rootfs.ext2,format=raw`
>
>* `qemu-system-ppc -M ppce500 -cpu e500mc -m 256 -kernel uImage -append
>
>"console=ttyS0 rootwait root=/dev/mmcblk0" -device sd-card,drive=mydrive -drive
>
>id=mydrive,if=none,file=rootfs.ext2,format=raw`
>
>
>
>The load was created using latest Buildroot with `make
>
>qemu_ppc_e500mc_defconfig` where the rootfs was configured to be of ext2 type.
>
>In both cases it was possible to log in and explore the root file system.
>
>
>
>v3:
>
>~~~
>
>Phil:
>
>- Also add power-of-2 fix to pflash_cfi02
>
>- Resolve cfi01-specific assertion in e500 code
>
>- Resolve unused define in eSDHC device model
>
>- Resolve redundant alignment checks in eSDHC device model
>
>
>
>Bin:
>
>- Add dedicated flash chapter to documentation
>
>
>
>Bernhard:
>
>- Use is_power_of_2() instead of ctpop64() for better readability
>
>- Only instantiate eSDHC device model in ppce500 (not used in MPC8544DS)
>
>- Rebase onto gitlab.com/danielhb/qemu/tree/ppc-next
- Move cfi0x memory region setup into board code to avoid cfi01-specific 
assertion there
- While at it, resolve unreachable code related to cfi01 device creation
- Reorder patches such that trivial patches come first

Best regards,
Bernhard

>
>
>
>v2:
>
>~~~
>
>Bin:
>
>- Add source for MPC8544DS platform bus' memory map in commit message.
>
>- Keep "ESDHC" in comment referring to Linux driver.
>
>- Use "qemu-system-ppc{64|32} in documentation.
>
>- Use g_autofree in device tree code.
>
>- Remove unneeded device tree properties.
>
>- Error out if pflash size doesn't fit into eLBC memory window.
>
>- Remove unused ESDHC defines.
>
>- Define macro ESDHC_WML for register offset with magic constant.
>
>- Fix some whitespace issues when adding eSDHC device to e500.
>
>
>
>Phil:
>
>- Fix tense in commit message.
>
>
>
>Bernhard Beschow (9):
>
>  hw/block/pflash_cfi0{1,2}: Error out if device length isn't a power of
>
>two
>
>  hw/{arm,ppc}: Resolve unreachable code
>
>  hw/block/pflash_cfi01: Attach memory region in boards
>
>  hw/block/pflash_cfi02: Attach memory region in boards
>
>  hw/sd/sdhci-internal: Unexport ESDHC defines
>
>  hw/sd/sdhci: Rename ESDHC_* defines to USDHC_*
>
>  hw/ppc/e500: Implement pflash handling
>
>  hw/sd/sdhci: Implement Freescale eSDHC device model
>
>  hw/ppc/e500: Add Freescale eSDHC to e500plat
>
>
>
> docs/system/ppc/ppce500.rst  |  28 
>
> hw/arm/collie.c  |  20 ++-
>
> hw/arm/digic_boards.c|  16 +-
>
> hw/arm/gumstix.c |  24 +--
>
> hw/arm/mainstone.c   |  15 +-
>
> hw/arm/musicpal.c|  15 +-
>
> hw/arm/omap_sx1.c|  25 ++--
>
> hw/arm/versatilepb.c |  14 +-
>
> hw/arm/xilinx_zynq.c |  12 +-
>
> hw/arm/z2.c  |  12 +-
>
> hw/block/pflash_cfi01.c  |  12 +-
>
> hw/block/pflash_cfi02.c  |  14 +-
>
> hw/microblaze/petalogix_ml605_mmu.c  |  16 +-
>
> hw/microblaze/petalogix_s3adsp1800_mmu.c |  10 +-
>
> hw/mips/malta.c  |   4 +-
>
> hw/ppc/Kconfig   |   2 +
>
> hw/ppc/e500.c|  97 +++-
>
> hw/ppc/e500.h|   1 +
>
> hw/ppc/e500plat.c|   1 +
>
> hw/ppc/sam460ex.c|  19 ++-
>
> hw/ppc/virtex_ml507.c|   5 +-
>
> hw/sd/sdhci-internal.h   |  20 ---
>
> hw/sd/sdhci.c| 183 ---
>
> hw/sh4/r2d.c |  11 +-
>
> include/hw/block/flash.h |   7 +-
>
> include/hw/sd/sdhci.h|   3 +
>
> 26 files changed, 433 insertions(+), 153 deletions(-)
>
>
>
>-- >
>2.38.0
>
>
>




Re: [PATCH v2 13/13] hw/ppc/e500: Add Freescale eSDHC to e500 boards

2022-10-16 Thread Bernhard Beschow
Am 3. Oktober 2022 21:06:57 UTC schrieb "Philippe Mathieu-Daudé" 
:
>On 3/10/22 22:31, Bernhard Beschow wrote:
>> Adds missing functionality to emulated e500 SOCs which increases the
>> chance of given "real" firmware images to access SD cards.
>> 
>> Signed-off-by: Bernhard Beschow 
>> ---
>>   docs/system/ppc/ppce500.rst | 13 +
>>   hw/ppc/Kconfig  |  1 +
>>   hw/ppc/e500.c   | 31 ++-
>>   3 files changed, 44 insertions(+), 1 deletion(-)
>
>> +static void dt_sdhc_create(void *fdt, const char *parent, const char *mpic)
>> +{
>> +hwaddr mmio = MPC85XX_ESDHC_REGS_OFFSET;
>> +hwaddr size = MPC85XX_ESDHC_REGS_SIZE;
>> +int irq = MPC85XX_ESDHC_IRQ;
>
>Why not pass these 3 variable as argument?

Besides looking for a way to derive these parameters from QOM properties I 
wanted to keep the code consistent to existing one, e.g. dt_i2c_create().

Best regards,
Bernhard
>
>> +g_autofree char *name = NULL;
>> +
>> +name = g_strdup_printf("%s/sdhc@%" PRIx64, parent, mmio);
>> +qemu_fdt_add_subnode(fdt, name);
>> +qemu_fdt_setprop(fdt, name, "sdhci,auto-cmd12", NULL, 0);
>> +qemu_fdt_setprop_phandle(fdt, name, "interrupt-parent", mpic);
>> +qemu_fdt_setprop_cells(fdt, name, "bus-width", 4);
>> +qemu_fdt_setprop_cells(fdt, name, "interrupts", irq, 0x2);
>> +qemu_fdt_setprop_cells(fdt, name, "reg", mmio, size);
>> +qemu_fdt_setprop_string(fdt, name, "compatible", "fsl,esdhc");
>> +}
>> typedef struct PlatformDevtreeData {
>>   void *fdt;
>> @@ -553,6 +573,8 @@ static int ppce500_load_device_tree(PPCE500MachineState 
>> *pms,
>> dt_rtc_create(fdt, "i2c", "rtc");
>>   +/* sdhc */
>> +dt_sdhc_create(fdt, soc, mpic);
>>   



[PATCH v3 5/9] hw/sd/sdhci-internal: Unexport ESDHC defines

2022-10-16 Thread Bernhard Beschow
These defines aren't used outside of sdhci.c, so can be defined there.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Bin Meng 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/sd/sdhci-internal.h | 20 
 hw/sd/sdhci.c  | 19 +++
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/hw/sd/sdhci-internal.h b/hw/sd/sdhci-internal.h
index e8c753d6d1..964570f8e8 100644
--- a/hw/sd/sdhci-internal.h
+++ b/hw/sd/sdhci-internal.h
@@ -288,26 +288,6 @@ enum {
 
 extern const VMStateDescription sdhci_vmstate;
 
-
-#define ESDHC_MIX_CTRL  0x48
-
-#define ESDHC_VENDOR_SPEC   0xc0
-#define ESDHC_IMX_FRC_SDCLK_ON  (1 << 8)
-
-#define ESDHC_DLL_CTRL  0x60
-
-#define ESDHC_TUNING_CTRL   0xcc
-#define ESDHC_TUNE_CTRL_STATUS  0x68
-#define ESDHC_WTMK_LVL  0x44
-
-/* Undocumented register used by guests working around erratum ERR004536 */
-#define ESDHC_UNDOCUMENTED_REG270x6c
-
-#define ESDHC_CTRL_4BITBUS  (0x1 << 1)
-#define ESDHC_CTRL_8BITBUS  (0x2 << 1)
-
-#define ESDHC_PRNSTS_SDSTB  (1 << 3)
-
 /*
  * Default SD/MMC host controller features information, which will be
  * presented in CAPABILITIES register of generic SD host controller at reset.
diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
index 0e5e988927..6da5e2c781 100644
--- a/hw/sd/sdhci.c
+++ b/hw/sd/sdhci.c
@@ -1577,6 +1577,25 @@ static const TypeInfo sdhci_bus_info = {
 
 /* --- qdev i.MX eSDHC --- */
 
+#define ESDHC_MIX_CTRL  0x48
+
+#define ESDHC_VENDOR_SPEC   0xc0
+#define ESDHC_IMX_FRC_SDCLK_ON  (1 << 8)
+
+#define ESDHC_DLL_CTRL  0x60
+
+#define ESDHC_TUNING_CTRL   0xcc
+#define ESDHC_TUNE_CTRL_STATUS  0x68
+#define ESDHC_WTMK_LVL  0x44
+
+/* Undocumented register used by guests working around erratum ERR004536 */
+#define ESDHC_UNDOCUMENTED_REG270x6c
+
+#define ESDHC_CTRL_4BITBUS  (0x1 << 1)
+#define ESDHC_CTRL_8BITBUS  (0x2 << 1)
+
+#define ESDHC_PRNSTS_SDSTB  (1 << 3)
+
 static uint64_t usdhc_read(void *opaque, hwaddr offset, unsigned size)
 {
 SDHCIState *s = SYSBUS_SDHCI(opaque);
-- 
2.38.0




[PATCH v3 7/9] hw/ppc/e500: Implement pflash handling

2022-10-16 Thread Bernhard Beschow
Allows e500 boards to have their root file system reside on flash using
only builtin devices located in the eLBC memory region.

Note that the flash memory area is only created when a -pflash argument is
given, and that the size is determined by the given file. The idea is to
put users in control.

Signed-off-by: Bernhard Beschow 
---
 docs/system/ppc/ppce500.rst | 16 ++
 hw/ppc/Kconfig  |  1 +
 hw/ppc/e500.c   | 62 +
 3 files changed, 79 insertions(+)

diff --git a/docs/system/ppc/ppce500.rst b/docs/system/ppc/ppce500.rst
index ba6bcb7314..99d2c680d6 100644
--- a/docs/system/ppc/ppce500.rst
+++ b/docs/system/ppc/ppce500.rst
@@ -165,3 +165,19 @@ if “-device eTSEC” is given to QEMU:
 .. code-block:: bash
 
   -netdev tap,ifname=tap0,script=no,downscript=no,id=net0 -device 
eTSEC,netdev=net0
+
+Root file system on flash drive
+-------------------------------
+
+Rather than using a root file system on ram disk, it is possible to have it on
+CFI flash. Given an ext2 image whose size must be a power of two, it can be 
used
+as follows:
+
+.. code-block:: bash
+
+  $ qemu-system-ppc{64|32} -M ppce500 -cpu e500mc -smp 4 -m 2G \
+  -display none -serial stdio \
+  -kernel vmlinux \
+  -drive if=pflash,file=/path/to/rootfs.ext2,format=raw \
+  -append "rootwait root=/dev/mtdblock0"
+
diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig
index 791fe78a50..769a1ead1c 100644
--- a/hw/ppc/Kconfig
+++ b/hw/ppc/Kconfig
@@ -126,6 +126,7 @@ config E500
 select ETSEC
 select GPIO_MPC8XXX
 select OPENPIC
+select PFLASH_CFI01
 select PLATFORM_BUS
 select PPCE500_PCI
 select SERIAL
diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index 3e950ea3ba..23d2c3451a 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -23,8 +23,10 @@
 #include "e500-ccsr.h"
 #include "net/net.h"
 #include "qemu/config-file.h"
+#include "hw/block/flash.h"
 #include "hw/char/serial.h"
 #include "hw/pci/pci.h"
+#include "sysemu/block-backend-io.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/kvm.h"
 #include "sysemu/reset.h"
@@ -267,6 +269,31 @@ static void sysbus_device_create_devtree(SysBusDevice 
*sbdev, void *opaque)
 }
 }
 
+static void create_devtree_flash(SysBusDevice *sbdev,
+ PlatformDevtreeData *data)
+{
+g_autofree char *name = NULL;
+uint64_t num_blocks = object_property_get_uint(OBJECT(sbdev),
+   "num-blocks",
+   _fatal);
+uint64_t sector_length = object_property_get_uint(OBJECT(sbdev),
+  "sector-length",
+  _fatal);
+uint64_t bank_width = object_property_get_uint(OBJECT(sbdev),
+   "width",
+   _fatal);
+hwaddr flashbase = 0;
+hwaddr flashsize = num_blocks * sector_length;
+void *fdt = data->fdt;
+
+name = g_strdup_printf("%s/nor@%" PRIx64, data->node, flashbase);
+qemu_fdt_add_subnode(fdt, name);
+qemu_fdt_setprop_string(fdt, name, "compatible", "cfi-flash");
+qemu_fdt_setprop_sized_cells(fdt, name, "reg",
+ 1, flashbase, 1, flashsize);
+qemu_fdt_setprop_cell(fdt, name, "bank-width", bank_width);
+}
+
 static void platform_bus_create_devtree(PPCE500MachineState *pms,
 void *fdt, const char *mpic)
 {
@@ -276,6 +303,8 @@ static void platform_bus_create_devtree(PPCE500MachineState 
*pms,
 uint64_t addr = pmc->platform_bus_base;
 uint64_t size = pmc->platform_bus_size;
 int irq_start = pmc->platform_bus_first_irq;
+SysBusDevice *sbdev;
+bool ambiguous;
 
 /* Create a /platform node that we can put all devices into */
 
@@ -302,6 +331,13 @@ static void 
platform_bus_create_devtree(PPCE500MachineState *pms,
 /* Loop through all dynamic sysbus devices and create nodes for them */
 foreach_dynamic_sysbus_device(sysbus_device_create_devtree, );
 
+sbdev = SYS_BUS_DEVICE(object_resolve_path_type("", TYPE_PFLASH_CFI01,
+));
+if (sbdev) {
+assert(!ambiguous);
+create_devtree_flash(sbdev, );
+}
+
 g_free(node);
 }
 
@@ -856,6 +892,7 @@ void ppce500_init(MachineState *machine)
 unsigned int pci_irq_nrs[PCI_NUM_PINS] = {1, 2, 3, 4};
 IrqLines *irqs;
 DeviceState *dev, *mpicdev;
+DriveInfo *dinfo;
 CPUPPCState *firstenv = NULL;
 MemoryRegion *ccsr_addr_space;
 SysBusDevice *s;
@@ -1024,6 +1061,31 @@ void ppce500_init(MachineState *machine)
 pmc->platform_bus_base,
 >pbus_dev->mmio);
 
+dinfo = drive_get(IF_PFLASH, 0, 0);
+if (dinfo) {
+BlockBackend *blk = blk_by_legacy_dinfo(dinfo);
+

[PATCH v3 8/9] hw/sd/sdhci: Implement Freescale eSDHC device model

2022-10-16 Thread Bernhard Beschow
Will allow e500 boards to access SD cards using just their own devices.

Signed-off-by: Bernhard Beschow 
---
 hw/sd/sdhci.c | 120 +-
 include/hw/sd/sdhci.h |   3 ++
 2 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
index 306070c872..8d8ad9ff24 100644
--- a/hw/sd/sdhci.c
+++ b/hw/sd/sdhci.c
@@ -1369,6 +1369,7 @@ void sdhci_initfn(SDHCIState *s)
 s->transfer_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, sdhci_data_transfer, 
s);
 
 s->io_ops = _mmio_ops;
+s->io_registers_map_size = SDHC_REGISTERS_MAP_SIZE;
 }
 
 void sdhci_uninitfn(SDHCIState *s)
@@ -1392,7 +1393,7 @@ void sdhci_common_realize(SDHCIState *s, Error **errp)
 s->fifo_buffer = g_malloc0(s->buf_maxsz);
 
 memory_region_init_io(>iomem, OBJECT(s), s->io_ops, s, "sdhci",
-  SDHC_REGISTERS_MAP_SIZE);
+  s->io_registers_map_size);
 }
 
 void sdhci_common_unrealize(SDHCIState *s)
@@ -1575,6 +1576,122 @@ static const TypeInfo sdhci_bus_info = {
 .class_init = sdhci_bus_class_init,
 };
 
+/* --- qdev Freescale eSDHC --- */
+
+/* Watermark Level Register */
+#define ESDHC_WML0x44
+
+/* Control Register for DMA transfer */
+#define ESDHC_DMA_SYSCTL0x40c
+
+#define ESDHC_REGISTERS_MAP_SIZE0x410
+
+static uint64_t esdhci_read(void *opaque, hwaddr offset, unsigned size)
+{
+uint64_t ret;
+
+switch (offset) {
+case SDHC_SYSAD:
+case SDHC_BLKSIZE:
+case SDHC_ARGUMENT:
+case SDHC_TRNMOD:
+case SDHC_RSPREG0:
+case SDHC_RSPREG1:
+case SDHC_RSPREG2:
+case SDHC_RSPREG3:
+case SDHC_BDATA:
+case SDHC_PRNSTS:
+case SDHC_HOSTCTL:
+case SDHC_CLKCON:
+case SDHC_NORINTSTS:
+case SDHC_NORINTSTSEN:
+case SDHC_NORINTSIGEN:
+case SDHC_ACMD12ERRSTS:
+case SDHC_CAPAB:
+case SDHC_SLOT_INT_STATUS:
+ret = sdhci_read(opaque, offset, size);
+break;
+
+case ESDHC_WML:
+case ESDHC_DMA_SYSCTL:
+ret = 0;
+qemu_log_mask(LOG_UNIMP, "ESDHC rd @0x%02" HWADDR_PRIx
+  " not implemented\n", offset);
+break;
+
+default:
+ret = 0;
+qemu_log_mask(LOG_GUEST_ERROR, "ESDHC rd @0x%02" HWADDR_PRIx
+  " unknown offset\n", offset);
+break;
+}
+
+return ret;
+}
+
+static void esdhci_write(void *opaque, hwaddr offset, uint64_t val,
+ unsigned size)
+{
+switch (offset) {
+case SDHC_SYSAD:
+case SDHC_BLKSIZE:
+case SDHC_ARGUMENT:
+case SDHC_TRNMOD:
+case SDHC_BDATA:
+case SDHC_HOSTCTL:
+case SDHC_CLKCON:
+case SDHC_NORINTSTS:
+case SDHC_NORINTSTSEN:
+case SDHC_NORINTSIGEN:
+case SDHC_FEAER:
+sdhci_write(opaque, offset, val, size);
+break;
+
+case ESDHC_WML:
+case ESDHC_DMA_SYSCTL:
+qemu_log_mask(LOG_UNIMP, "ESDHC wr @0x%02" HWADDR_PRIx " <- 0x%08lx "
+  "not implemented\n", offset, val);
+break;
+
+default:
+qemu_log_mask(LOG_GUEST_ERROR, "ESDHC wr @0x%02" HWADDR_PRIx
+  " <- 0x%08lx unknown offset\n", offset, val);
+break;
+}
+}
+
+static const MemoryRegionOps esdhc_mmio_ops = {
+.read = esdhci_read,
+.write = esdhci_write,
+.valid = {
+.min_access_size = 4,
+.max_access_size = 4,
+.unaligned = false
+},
+.endianness = DEVICE_BIG_ENDIAN,
+};
+
+static void esdhci_init(Object *obj)
+{
+DeviceState *dev = DEVICE(obj);
+SDHCIState *s = SYSBUS_SDHCI(obj);
+
+s->io_ops = _mmio_ops;
+s->io_registers_map_size = ESDHC_REGISTERS_MAP_SIZE;
+
+/*
+ * Compatible with:
+ * - SD Host Controller Specification Version 2.0 Part A2
+ */
+qdev_prop_set_uint8(dev, "sd-spec-version", 2);
+}
+
+static const TypeInfo esdhc_info = {
+.name = TYPE_FSL_ESDHC,
+.parent = TYPE_SYSBUS_SDHCI,
+.instance_init = esdhci_init,
+};
+
 /* --- qdev i.MX eSDHC --- */
 
 #define USDHC_MIX_CTRL  0x48
@@ -1907,6 +2024,7 @@ static void sdhci_register_types(void)
 {
 type_register_static(_sysbus_info);
 type_register_static(_bus_info);
+type_register_static(_info);
 type_register_static(_usdhc_info);
 type_register_static(_s3c_info);
 }
diff --git a/include/hw/sd/sdhci.h b/include/hw/sd/sdhci.h
index 01a64c5442..5b32e83eee 100644
--- a/include/hw/sd/sdhci.h
+++ b/include/hw/sd/sdhci.h
@@ -45,6 +45,7 @@ struct SDHCIState {
 AddressSpace *dma_as;
 MemoryRegion *dma_mr;
 const MemoryRegionOps *io_ops;
+uint64_t io_registers_map_size;
 
 QEMUTimer *insert_timer;   /* timer for 'changing' sd card. */
 QEMUTimer *transfer_timer;
@@ -122,6 +123,8 @@ DECLARE_INSTANCE_CHECKER(SDHCIState, PCI_SDHCI,
 DECLARE_INSTANCE_CHECKER(SDHCIState, SYSBUS_SDHCI,
  TYPE_SYSBUS_SDHCI)
 
+#define TYPE_FSL_ESDHC "fsl-esdhc"
+
 

[PATCH v3 6/9] hw/sd/sdhci: Rename ESDHC_* defines to USDHC_*

2022-10-16 Thread Bernhard Beschow
The device model's functions start with "usdhc_", so rename the defines
accordingly for consistency.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Bin Meng 
---
 hw/sd/sdhci.c | 66 +--
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
index 6da5e2c781..306070c872 100644
--- a/hw/sd/sdhci.c
+++ b/hw/sd/sdhci.c
@@ -1577,24 +1577,24 @@ static const TypeInfo sdhci_bus_info = {
 
 /* --- qdev i.MX eSDHC --- */
 
-#define ESDHC_MIX_CTRL  0x48
+#define USDHC_MIX_CTRL  0x48
 
-#define ESDHC_VENDOR_SPEC   0xc0
-#define ESDHC_IMX_FRC_SDCLK_ON  (1 << 8)
+#define USDHC_VENDOR_SPEC   0xc0
+#define USDHC_IMX_FRC_SDCLK_ON  (1 << 8)
 
-#define ESDHC_DLL_CTRL  0x60
+#define USDHC_DLL_CTRL  0x60
 
-#define ESDHC_TUNING_CTRL   0xcc
-#define ESDHC_TUNE_CTRL_STATUS  0x68
-#define ESDHC_WTMK_LVL  0x44
+#define USDHC_TUNING_CTRL   0xcc
+#define USDHC_TUNE_CTRL_STATUS  0x68
+#define USDHC_WTMK_LVL  0x44
 
 /* Undocumented register used by guests working around erratum ERR004536 */
-#define ESDHC_UNDOCUMENTED_REG270x6c
+#define USDHC_UNDOCUMENTED_REG270x6c
 
-#define ESDHC_CTRL_4BITBUS  (0x1 << 1)
-#define ESDHC_CTRL_8BITBUS  (0x2 << 1)
+#define USDHC_CTRL_4BITBUS  (0x1 << 1)
+#define USDHC_CTRL_8BITBUS  (0x2 << 1)
 
-#define ESDHC_PRNSTS_SDSTB  (1 << 3)
+#define USDHC_PRNSTS_SDSTB  (1 << 3)
 
 static uint64_t usdhc_read(void *opaque, hwaddr offset, unsigned size)
 {
@@ -1615,11 +1615,11 @@ static uint64_t usdhc_read(void *opaque, hwaddr offset, 
unsigned size)
 hostctl1 = SDHC_DMA_TYPE(s->hostctl1) << (8 - 3);
 
 if (s->hostctl1 & SDHC_CTRL_8BITBUS) {
-hostctl1 |= ESDHC_CTRL_8BITBUS;
+hostctl1 |= USDHC_CTRL_8BITBUS;
 }
 
 if (s->hostctl1 & SDHC_CTRL_4BITBUS) {
-hostctl1 |= ESDHC_CTRL_4BITBUS;
+hostctl1 |= USDHC_CTRL_4BITBUS;
 }
 
 ret  = hostctl1;
@@ -1630,21 +1630,21 @@ static uint64_t usdhc_read(void *opaque, hwaddr offset, 
unsigned size)
 
 case SDHC_PRNSTS:
 /* Add SDSTB (SD Clock Stable) bit to PRNSTS */
-ret = sdhci_read(opaque, offset, size) & ~ESDHC_PRNSTS_SDSTB;
+ret = sdhci_read(opaque, offset, size) & ~USDHC_PRNSTS_SDSTB;
 if (s->clkcon & SDHC_CLOCK_INT_STABLE) {
-ret |= ESDHC_PRNSTS_SDSTB;
+ret |= USDHC_PRNSTS_SDSTB;
 }
 break;
 
-case ESDHC_VENDOR_SPEC:
+case USDHC_VENDOR_SPEC:
 ret = s->vendor_spec;
 break;
-case ESDHC_DLL_CTRL:
-case ESDHC_TUNE_CTRL_STATUS:
-case ESDHC_UNDOCUMENTED_REG27:
-case ESDHC_TUNING_CTRL:
-case ESDHC_MIX_CTRL:
-case ESDHC_WTMK_LVL:
+case USDHC_DLL_CTRL:
+case USDHC_TUNE_CTRL_STATUS:
+case USDHC_UNDOCUMENTED_REG27:
+case USDHC_TUNING_CTRL:
+case USDHC_MIX_CTRL:
+case USDHC_WTMK_LVL:
 ret = 0;
 break;
 }
@@ -1660,18 +1660,18 @@ usdhc_write(void *opaque, hwaddr offset, uint64_t val, 
unsigned size)
 uint32_t value = (uint32_t)val;
 
 switch (offset) {
-case ESDHC_DLL_CTRL:
-case ESDHC_TUNE_CTRL_STATUS:
-case ESDHC_UNDOCUMENTED_REG27:
-case ESDHC_TUNING_CTRL:
-case ESDHC_WTMK_LVL:
+case USDHC_DLL_CTRL:
+case USDHC_TUNE_CTRL_STATUS:
+case USDHC_UNDOCUMENTED_REG27:
+case USDHC_TUNING_CTRL:
+case USDHC_WTMK_LVL:
 break;
 
-case ESDHC_VENDOR_SPEC:
+case USDHC_VENDOR_SPEC:
 s->vendor_spec = value;
 switch (s->vendor) {
 case SDHCI_VENDOR_IMX:
-if (value & ESDHC_IMX_FRC_SDCLK_ON) {
+if (value & USDHC_IMX_FRC_SDCLK_ON) {
 s->prnsts &= ~SDHC_IMX_CLOCK_GATE_OFF;
 } else {
 s->prnsts |= SDHC_IMX_CLOCK_GATE_OFF;
@@ -1740,12 +1740,12 @@ usdhc_write(void *opaque, hwaddr offset, uint64_t val, 
unsigned size)
  * Second, split "Data Transfer Width" from bits 2 and 1 in to
  * bits 5 and 1
  */
-if (value & ESDHC_CTRL_8BITBUS) {
+if (value & USDHC_CTRL_8BITBUS) {
 hostctl1 |= SDHC_CTRL_8BITBUS;
 }
 
-if (value & ESDHC_CTRL_4BITBUS) {
-hostctl1 |= ESDHC_CTRL_4BITBUS;
+if (value & USDHC_CTRL_4BITBUS) {
+hostctl1 |= USDHC_CTRL_4BITBUS;
 }
 
 /*
@@ -1768,7 +1768,7 @@ usdhc_write(void *opaque, hwaddr offset, uint64_t val, 
unsigned size)
 sdhci_write(opaque, offset, value, size);
 break;
 
-case ESDHC_MIX_CTRL:
+case USDHC_MIX_CTRL:
 /*
  * So, when SD/MMC stack in Linux tries to write to "Transfer
  * Mode Register", ESDHC i.MX quirk code will translate it
-- 
2.38.0




[PATCH v3 3/9] hw/block/pflash_cfi01: Attach memory region in boards

2022-10-16 Thread Bernhard Beschow
pflash_cfi01_register() had a parameter which was only passed to
sysbus_mmio_map() but not used otherwise. Pulling out sysbus_mmio_map()
resolves that parameter and concentrates the memory region setup in
board code. Furthermore, it allows attaching cfi01 devices relative to
some parent bus rather than to the global "sysbus".

While at it, replace sysbus_mmio_map() with non-sysbus equivalents.

Signed-off-by: Bernhard Beschow 
---
 hw/arm/collie.c  | 20 +---
 hw/arm/gumstix.c | 18 --
 hw/arm/mainstone.c   | 16 ++--
 hw/arm/omap_sx1.c| 19 +++
 hw/arm/versatilepb.c | 12 +++-
 hw/arm/z2.c  |  9 ++---
 hw/block/pflash_cfi01.c  |  4 +---
 hw/microblaze/petalogix_ml605_mmu.c  | 16 ++--
 hw/microblaze/petalogix_s3adsp1800_mmu.c | 10 ++
 hw/mips/malta.c  |  4 ++--
 hw/ppc/sam460ex.c| 15 +--
 hw/ppc/virtex_ml507.c|  5 -
 include/hw/block/flash.h |  3 +--
 13 files changed, 92 insertions(+), 59 deletions(-)

diff --git a/hw/arm/collie.c b/hw/arm/collie.c
index 8df31e2793..25fb5f657b 100644
--- a/hw/arm/collie.c
+++ b/hw/arm/collie.c
@@ -37,8 +37,10 @@ static struct arm_boot_info collie_binfo = {
 static void collie_init(MachineState *machine)
 {
 DriveInfo *dinfo;
+PFlashCFI01 *pfl;
 MachineClass *mc = MACHINE_GET_CLASS(machine);
 CollieMachineState *cms = COLLIE_MACHINE(machine);
+MemoryRegion *system_memory = get_system_memory();
 
 if (machine->ram_size != mc->default_ram_size) {
 char *sz = size_to_str(mc->default_ram_size);
@@ -49,17 +51,21 @@ static void collie_init(MachineState *machine)
 
 cms->sa1110 = sa1110_init(machine->cpu_type);
 
-memory_region_add_subregion(get_system_memory(), SA_SDCS0, machine->ram);
+memory_region_add_subregion(system_memory, SA_SDCS0, machine->ram);
 
 dinfo = drive_get(IF_PFLASH, 0, 0);
-pflash_cfi01_register(SA_CS0, "collie.fl1", 0x0200,
-dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
-64 * KiB, 4, 0x00, 0x00, 0x00, 0x00, 0);
+pfl = pflash_cfi01_register("collie.fl1", 0x0200,
+dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
+64 * KiB, 4, 0x00, 0x00, 0x00, 0x00, 0);
+memory_region_add_subregion(system_memory, SA_CS0,
+pflash_cfi01_get_memory(pfl));
 
 dinfo = drive_get(IF_PFLASH, 0, 1);
-pflash_cfi01_register(SA_CS1, "collie.fl2", 0x0200,
-dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
-64 * KiB, 4, 0x00, 0x00, 0x00, 0x00, 0);
+pfl = pflash_cfi01_register("collie.fl2", 0x0200,
+dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
+64 * KiB, 4, 0x00, 0x00, 0x00, 0x00, 0);
+memory_region_add_subregion(system_memory, SA_CS1,
+pflash_cfi01_get_memory(pfl));
 
 sysbus_create_simple("scoop", 0x4080, NULL);
 
diff --git a/hw/arm/gumstix.c b/hw/arm/gumstix.c
index 1296628ed9..d6c997ad8e 100644
--- a/hw/arm/gumstix.c
+++ b/hw/arm/gumstix.c
@@ -51,6 +51,7 @@ static void connex_init(MachineState *machine)
 {
 PXA2xxState *cpu;
 DriveInfo *dinfo;
+PFlashCFI01 *pfl;
 MemoryRegion *address_space_mem = get_system_memory();
 
 uint32_t connex_rom = 0x0100;
@@ -65,9 +66,11 @@ static void connex_init(MachineState *machine)
 exit(1);
 }
 
-pflash_cfi01_register(0x, "connext.rom", connex_rom,
-  dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
-  sector_len, 2, 0, 0, 0, 0, 0);
+pfl = pflash_cfi01_register("connext.rom", connex_rom,
+dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
+sector_len, 2, 0, 0, 0, 0, 0);
+memory_region_add_subregion(address_space_mem, 0x,
+pflash_cfi01_get_memory(pfl));
 
 /* Interrupt line of NIC is connected to GPIO line 36 */
 smc91c111_init(_table[0], 0x04000300,
@@ -78,6 +81,7 @@ static void verdex_init(MachineState *machine)
 {
 PXA2xxState *cpu;
 DriveInfo *dinfo;
+PFlashCFI01 *pfl;
 MemoryRegion *address_space_mem = get_system_memory();
 
 uint32_t verdex_rom = 0x0200;
@@ -92,9 +96,11 @@ static void verdex_init(MachineState *machine)
 exit(1);
 }
 
-pflash_cfi01_register(0x, "verdex.rom", verdex_rom,
-  dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
-  sector_len, 2, 0, 0, 0, 0, 0);
+pfl = pflash_cfi01_register("verdex.rom", verdex_rom,
+dinfo ? 

[PATCH v3 2/9] hw/{arm,ppc}: Resolve unreachable code

2022-10-16 Thread Bernhard Beschow
pflash_cfi01_register() always returns with a non-NULL pointer (otherwise
it would crash internally). Therefore, the bodies of the if-statements
are unreachable.

Signed-off-by: Bernhard Beschow 
---
 hw/arm/gumstix.c | 18 ++
 hw/arm/mainstone.c   | 13 +
 hw/arm/omap_sx1.c| 22 --
 hw/arm/versatilepb.c |  6 ++
 hw/arm/z2.c  |  9 +++--
 hw/ppc/sam460ex.c| 12 
 6 files changed, 28 insertions(+), 52 deletions(-)

diff --git a/hw/arm/gumstix.c b/hw/arm/gumstix.c
index 3a4bc332c4..1296628ed9 100644
--- a/hw/arm/gumstix.c
+++ b/hw/arm/gumstix.c
@@ -65,12 +65,9 @@ static void connex_init(MachineState *machine)
 exit(1);
 }
 
-if (!pflash_cfi01_register(0x, "connext.rom", connex_rom,
-   dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
-   sector_len, 2, 0, 0, 0, 0, 0)) {
-error_report("Error registering flash memory");
-exit(1);
-}
+pflash_cfi01_register(0x, "connext.rom", connex_rom,
+  dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
+  sector_len, 2, 0, 0, 0, 0, 0);
 
 /* Interrupt line of NIC is connected to GPIO line 36 */
 smc91c111_init(_table[0], 0x04000300,
@@ -95,12 +92,9 @@ static void verdex_init(MachineState *machine)
 exit(1);
 }
 
-if (!pflash_cfi01_register(0x, "verdex.rom", verdex_rom,
-   dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
-   sector_len, 2, 0, 0, 0, 0, 0)) {
-error_report("Error registering flash memory");
-exit(1);
-}
+pflash_cfi01_register(0x, "verdex.rom", verdex_rom,
+  dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
+  sector_len, 2, 0, 0, 0, 0, 0);
 
 /* Interrupt line of NIC is connected to GPIO line 99 */
 smc91c111_init(_table[0], 0x04000300,
diff --git a/hw/arm/mainstone.c b/hw/arm/mainstone.c
index 8454b65458..40f708f2d3 100644
--- a/hw/arm/mainstone.c
+++ b/hw/arm/mainstone.c
@@ -130,14 +130,11 @@ static void mainstone_common_init(MemoryRegion 
*address_space_mem,
 /* There are two 32MiB flash devices on the board */
 for (i = 0; i < 2; i ++) {
 dinfo = drive_get(IF_PFLASH, 0, i);
-if (!pflash_cfi01_register(mainstone_flash_base[i],
-   i ? "mainstone.flash1" : "mainstone.flash0",
-   MAINSTONE_FLASH,
-   dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
-   sector_len, 4, 0, 0, 0, 0, 0)) {
-error_report("Error registering flash memory");
-exit(1);
-}
+pflash_cfi01_register(mainstone_flash_base[i],
+  i ? "mainstone.flash1" : "mainstone.flash0",
+  MAINSTONE_FLASH,
+  dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
+  sector_len, 4, 0, 0, 0, 0, 0);
 }
 
 mst_irq = sysbus_create_simple("mainstone-fpga", MST_FPGA_PHYS,
diff --git a/hw/arm/omap_sx1.c b/hw/arm/omap_sx1.c
index 57829b3744..820652265b 100644
--- a/hw/arm/omap_sx1.c
+++ b/hw/arm/omap_sx1.c
@@ -153,13 +153,10 @@ static void sx1_init(MachineState *machine, const int 
version)
 
 fl_idx = 0;
 if ((dinfo = drive_get(IF_PFLASH, 0, fl_idx)) != NULL) {
-if (!pflash_cfi01_register(OMAP_CS0_BASE,
-   "omap_sx1.flash0-1", flash_size,
-   blk_by_legacy_dinfo(dinfo),
-   sector_size, 4, 0, 0, 0, 0, 0)) {
-fprintf(stderr, "qemu: Error registering flash memory %d.\n",
-   fl_idx);
-}
+pflash_cfi01_register(OMAP_CS0_BASE,
+  "omap_sx1.flash0-1", flash_size,
+  blk_by_legacy_dinfo(dinfo),
+  sector_size, 4, 0, 0, 0, 0, 0);
 fl_idx++;
 }
 
@@ -175,13 +172,10 @@ static void sx1_init(MachineState *machine, const int 
version)
 memory_region_add_subregion(address_space,
 OMAP_CS1_BASE + flash1_size, [1]);
 
-if (!pflash_cfi01_register(OMAP_CS1_BASE,
-   "omap_sx1.flash1-1", flash1_size,
-   blk_by_legacy_dinfo(dinfo),
-   sector_size, 4, 0, 0, 0, 0, 0)) {
-fprintf(stderr, "qemu: Error registering flash memory %d.\n",
-   fl_idx);
-}
+pflash_cfi01_register(OMAP_CS1_BASE,
+  "omap_sx1.flash1-1", flash1_size,
+  blk_by_legacy_dinfo(dinfo),
+  sector_size, 4, 0, 0, 0, 0, 0);
 fl_idx++;
 } else {
 

[PATCH v3 0/9] ppc/e500: Add support for two types of flash, cleanup

2022-10-16 Thread Bernhard Beschow
Cover letter:
~

This series adds support for -pflash and direct SD card access to the
PPC e500 boards. The idea is to increase compatibility with "real" firmware
images where only the bare minimum of drivers is compiled in.

The series is structured as follows:

Patches 1-6 perform some general cleanup which paves the way for the rest of
the series.

Patch 7 adds -pflash handling where memory-mapped flash can be added on
user's behalf. That is, the flash memory region in the eLBC is only added if
the -pflash argument is supplied. Note that the cfi01 device model becomes
stricter in checking the size of the emulated flash space.

Patches 8 and 9 add a new device model - the Freescale eSDHC - to the e500
boards which was missing so far.

User documentation is also added as the new features become available.

Testing done:
* `qemu-system-ppc -M ppce500 -cpu e500mc -m 256 -kernel uImage -append
"console=ttyS0 rootwait root=/dev/mtdblock0 nokaslr" -drive
if=pflash,file=rootfs.ext2,format=raw`
* `qemu-system-ppc -M ppce500 -cpu e500mc -m 256 -kernel uImage -append
"console=ttyS0 rootwait root=/dev/mmcblk0" -device sd-card,drive=mydrive -drive
id=mydrive,if=none,file=rootfs.ext2,format=raw`

The load was created using latest Buildroot with `make
qemu_ppc_e500mc_defconfig` where the rootfs was configured to be of ext2 type.
In both cases it was possible to log in and explore the root file system.

v3:
~~~
Phil:
- Also add power-of-2 fix to pflash_cfi02
- Resolve cfi01-specific assertion in e500 code
- Resolve unused define in eSDHC device model
- Resolve redundant alignment checks in eSDHC device model

Bin:
- Add dedicated flash chapter to documentation

Bernhard:
- Use is_power_of_2() instead of ctpop64() for better readability
- Only instantiate eSDHC device model in ppce500 (not used in MPC8544DS)
- Rebase onto gitlab.com/danielhb/qemu/tree/ppc-next

v2:
~~~
Bin:
- Add source for MPC8544DS platform bus' memory map in commit message.
- Keep "ESDHC" in comment referring to Linux driver.
- Use "qemu-system-ppc{64|32} in documentation.
- Use g_autofree in device tree code.
- Remove unneeded device tree properties.
- Error out if pflash size doesn't fit into eLBC memory window.
- Remove unused ESDHC defines.
- Define macro ESDHC_WML for register offset with magic constant.
- Fix some whitespace issues when adding eSDHC device to e500.

Phil:
- Fix tense in commit message.

Bernhard Beschow (9):
  hw/block/pflash_cfi0{1,2}: Error out if device length isn't a power of
two
  hw/{arm,ppc}: Resolve unreachable code
  hw/block/pflash_cfi01: Attach memory region in boards
  hw/block/pflash_cfi02: Attach memory region in boards
  hw/sd/sdhci-internal: Unexport ESDHC defines
  hw/sd/sdhci: Rename ESDHC_* defines to USDHC_*
  hw/ppc/e500: Implement pflash handling
  hw/sd/sdhci: Implement Freescale eSDHC device model
  hw/ppc/e500: Add Freescale eSDHC to e500plat

 docs/system/ppc/ppce500.rst  |  28 
 hw/arm/collie.c  |  20 ++-
 hw/arm/digic_boards.c|  16 +-
 hw/arm/gumstix.c |  24 +--
 hw/arm/mainstone.c   |  15 +-
 hw/arm/musicpal.c|  15 +-
 hw/arm/omap_sx1.c|  25 ++--
 hw/arm/versatilepb.c |  14 +-
 hw/arm/xilinx_zynq.c |  12 +-
 hw/arm/z2.c  |  12 +-
 hw/block/pflash_cfi01.c  |  12 +-
 hw/block/pflash_cfi02.c  |  14 +-
 hw/microblaze/petalogix_ml605_mmu.c  |  16 +-
 hw/microblaze/petalogix_s3adsp1800_mmu.c |  10 +-
 hw/mips/malta.c  |   4 +-
 hw/ppc/Kconfig   |   2 +
 hw/ppc/e500.c|  97 +++-
 hw/ppc/e500.h|   1 +
 hw/ppc/e500plat.c|   1 +
 hw/ppc/sam460ex.c|  19 ++-
 hw/ppc/virtex_ml507.c|   5 +-
 hw/sd/sdhci-internal.h   |  20 ---
 hw/sd/sdhci.c| 183 ---
 hw/sh4/r2d.c |  11 +-
 include/hw/block/flash.h |   7 +-
 include/hw/sd/sdhci.h|   3 +
 26 files changed, 433 insertions(+), 153 deletions(-)

-- 
2.38.0




[PATCH v3 4/9] hw/block/pflash_cfi02: Attach memory region in boards

2022-10-16 Thread Bernhard Beschow
pflash_cfi02_register() had a parameter which was only passed to
sysbus_mmio_map() but not used otherwise. Pulling out sysbus_mmio_map()
resolves that parameter and concentrates the memory region setup in
board code. Furthermore, it allows attaching cfi02 devices relative to
some parent bus rather than to the global "sysbus".

While at it, replace sysbus_mmio_map() with non-sysbus equivalents.

Signed-off-by: Bernhard Beschow 
---
 hw/arm/digic_boards.c| 16 ++--
 hw/arm/musicpal.c| 15 +--
 hw/arm/xilinx_zynq.c | 12 +++-
 hw/block/pflash_cfi02.c  |  9 ++---
 hw/sh4/r2d.c | 11 +++
 include/hw/block/flash.h |  4 ++--
 6 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/hw/arm/digic_boards.c b/hw/arm/digic_boards.c
index 4093af09cb..d3c5426cf9 100644
--- a/hw/arm/digic_boards.c
+++ b/hw/arm/digic_boards.c
@@ -116,12 +116,16 @@ static void digic4_add_k8p3215uqb_rom(DigicState *s, 
hwaddr addr,
 #define FLASH_K8P3215UQB_SIZE (4 * 1024 * 1024)
 #define FLASH_K8P3215UQB_SECTOR_SIZE (64 * 1024)
 
-pflash_cfi02_register(addr, "pflash", FLASH_K8P3215UQB_SIZE,
-  NULL, FLASH_K8P3215UQB_SECTOR_SIZE,
-  DIGIC4_ROM_MAX_SIZE / FLASH_K8P3215UQB_SIZE,
-  4,
-  0x00EC, 0x007E, 0x0003, 0x0001,
-  0x0555, 0x2aa, 0);
+PFlashCFI02 *pfl;
+
+pfl = pflash_cfi02_register("pflash", FLASH_K8P3215UQB_SIZE,
+NULL, FLASH_K8P3215UQB_SECTOR_SIZE,
+DIGIC4_ROM_MAX_SIZE / FLASH_K8P3215UQB_SIZE,
+4,
+0x00EC, 0x007E, 0x0003, 0x0001,
+0x0555, 0x2aa, 0);
+memory_region_add_subregion(get_system_memory(), addr,
+pflash_cfi02_get_memory(pfl));
 
 digic_load_rom(s, addr, FLASH_K8P3215UQB_SIZE, filename);
 }
diff --git a/hw/arm/musicpal.c b/hw/arm/musicpal.c
index b65c020115..efad741f6d 100644
--- a/hw/arm/musicpal.c
+++ b/hw/arm/musicpal.c
@@ -1261,6 +1261,7 @@ static void musicpal_init(MachineState *machine)
 /* Register flash */
 dinfo = drive_get(IF_PFLASH, 0, 0);
 if (dinfo) {
+PFlashCFI02 *pfl;
 BlockBackend *blk = blk_by_legacy_dinfo(dinfo);
 
 flash_size = blk_getlength(blk);
@@ -1275,12 +1276,14 @@ static void musicpal_init(MachineState *machine)
  * 0xFF80 (if there is 8 MB flash). So remap flash access if the
  * image is smaller than 32 MB.
  */
-pflash_cfi02_register(0x1ULL - MP_FLASH_SIZE_MAX,
-  "musicpal.flash", flash_size,
-  blk, 0x1,
-  MP_FLASH_SIZE_MAX / flash_size,
-  2, 0x00BF, 0x236D, 0x, 0x,
-  0x, 0x2AAA, 0);
+pfl = pflash_cfi02_register("musicpal.flash", flash_size,
+blk, 0x1,
+MP_FLASH_SIZE_MAX / flash_size,
+2, 0x00BF, 0x236D, 0x, 0x,
+0x, 0x2AAA, 0);
+memory_region_add_subregion(address_space_mem,
+0x1ULL - MP_FLASH_SIZE_MAX,
+pflash_cfi02_get_memory(pfl));
 }
 sysbus_create_simple(TYPE_MV88W8618_FLASHCFG, MP_FLASHCFG_BASE, NULL);
 
diff --git a/hw/arm/xilinx_zynq.c b/hw/arm/xilinx_zynq.c
index 3190cc0b8d..a2abb1cf31 100644
--- a/hw/arm/xilinx_zynq.c
+++ b/hw/arm/xilinx_zynq.c
@@ -182,6 +182,7 @@ static void zynq_init(MachineState *machine)
 MemoryRegion *ocm_ram = g_new(MemoryRegion, 1);
 DeviceState *dev, *slcr;
 SysBusDevice *busdev;
+PFlashCFI02 *pfl;
 qemu_irq pic[64];
 int n;
 
@@ -218,11 +219,12 @@ static void zynq_init(MachineState *machine)
 DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
 
 /* AMD */
-pflash_cfi02_register(0xe200, "zynq.pflash", FLASH_SIZE,
-  dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
-  FLASH_SECTOR_SIZE, 1,
-  1, 0x0066, 0x0022, 0x, 0x, 0x0555, 0x2aa,
-  0);
+pfl = pflash_cfi02_register("zynq.pflash", FLASH_SIZE,
+dinfo ? blk_by_legacy_dinfo(dinfo) : NULL,
+FLASH_SECTOR_SIZE, 1, 1, 0x0066, 0x0022, 
0x,
+0x, 0x0555, 0x2aa, 0);
+memory_region_add_subregion(address_space_mem, 0xe200,
+pflash_cfi02_get_memory(pfl));
 
 /* Create the main clock source, and feed slcr with it */
 zynq_machine->ps_clk = CLOCK(object_new(TYPE_CLOCK));
diff --git a/hw/block/pflash_cfi02.c b/hw/block/pflash_cfi02.c
index 

[PATCH v3 1/9] hw/block/pflash_cfi0{1, 2}: Error out if device length isn't a power of two

2022-10-16 Thread Bernhard Beschow
According to the JEDEC standard the device length is communicated to an
OS as an exponent (power of two).

Signed-off-by: Bernhard Beschow 
Reviewed-by: Bin Meng 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/block/pflash_cfi01.c | 8 ++--
 hw/block/pflash_cfi02.c | 5 +
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/hw/block/pflash_cfi01.c b/hw/block/pflash_cfi01.c
index 0cbc2fb4cb..9c235bf66e 100644
--- a/hw/block/pflash_cfi01.c
+++ b/hw/block/pflash_cfi01.c
@@ -690,7 +690,7 @@ static const MemoryRegionOps pflash_cfi01_ops = {
 .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
-static void pflash_cfi01_fill_cfi_table(PFlashCFI01 *pfl)
+static void pflash_cfi01_fill_cfi_table(PFlashCFI01 *pfl, Error **errp)
 {
 uint64_t blocks_per_device, sector_len_per_device, device_len;
 int num_devices;
@@ -708,6 +708,10 @@ static void pflash_cfi01_fill_cfi_table(PFlashCFI01 *pfl)
 sector_len_per_device = pfl->sector_len / num_devices;
 }
 device_len = sector_len_per_device * blocks_per_device;
+if (!is_power_of_2(device_len)) {
+error_setg(errp, "Device size must be a power of two.");
+return;
+}
 
 /* Hardcoded CFI table */
 /* Standard "QRY" string */
@@ -865,7 +869,7 @@ static void pflash_cfi01_realize(DeviceState *dev, Error 
**errp)
  */
 pfl->cmd = 0x00;
 pfl->status = 0x80; /* WSM ready */
-pflash_cfi01_fill_cfi_table(pfl);
+pflash_cfi01_fill_cfi_table(pfl, errp);
 }
 
 static void pflash_cfi01_system_reset(DeviceState *dev)
diff --git a/hw/block/pflash_cfi02.c b/hw/block/pflash_cfi02.c
index 2a99b286b0..ff2fe154c1 100644
--- a/hw/block/pflash_cfi02.c
+++ b/hw/block/pflash_cfi02.c
@@ -880,6 +880,11 @@ static void pflash_cfi02_realize(DeviceState *dev, Error 
**errp)
 return;
 }
 
+if (!is_power_of_2(pfl->chip_len)) {
+error_setg(errp, "Device size must be a power of two.");
+return;
+}
+
 memory_region_init_rom_device(>orig_mem, OBJECT(pfl),
   _cfi02_ops, pfl, pfl->name,
   pfl->chip_len, errp);
-- 
2.38.0