On Sun, Oct 30, 2022 at 12:35:45AM -0400, Dmitry Fomichev wrote: > This patch adds support for Zoned Block Devices (ZBDs) to the kernel > virtio-blk driver. > > The patch accompanies the virtio-blk ZBD support draft that is now > being proposed for standardization. The latest version of the draft is > linked at > > https://github.com/oasis-tcs/virtio-spec/issues/143 . > > The QEMU zoned device code that implements these protocol extensions > has been developed by Sam Li and it is currently in review at the QEMU > mailing list. > > A number of virtblk request structure changes has been introduced to > accommodate the functionality that is specific to zoned block devices > and, most importantly, make room for carrying the Zoned Append sector > value from the device back to the driver along with the request status. > > The zone-specific code in the patch is heavily influenced by NVMe ZNS > code in drivers/nvme/host/zns.c, but it is simpler because the proposed > virtio ZBD draft only covers the zoned device features that are > relevant to the zoned functionality provided by Linux block layer. > > Co-developed-by: Stefan Hajnoczi <[email protected]> > Signed-off-by: Stefan Hajnoczi <[email protected]> > Signed-off-by: Dmitry Fomichev <[email protected]> > --- > drivers/block/virtio_blk.c | 438 ++++++++++++++++++++++++++++++-- > include/uapi/linux/virtio_blk.h | 105 ++++++++ > 2 files changed, 524 insertions(+), 19 deletions(-) > > diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c > index 3efe3da5f8c2..03b5302fac6e 100644 > --- a/drivers/block/virtio_blk.c > +++ b/drivers/block/virtio_blk.c > @@ -15,6 +15,7 @@ > #include <linux/blk-mq.h> > #include <linux/blk-mq-virtio.h> > #include <linux/numa.h> > +#include <linux/vmalloc.h> > #include <uapi/linux/virtio_ring.h> > > #define PART_BITS 4 > @@ -80,22 +81,51 @@ struct virtio_blk { > int num_vqs; > int io_queues[HCTX_MAX_TYPES]; > struct virtio_blk_vq *vqs; > + > + /* For zoned device */ > + unsigned int zone_sectors; > }; > > struct virtblk_req { > + /* Out header */ > struct virtio_blk_outhdr out_hdr; > - u8 status; > + > + /* In header */ > + union { > + struct { > + u8 status; > + } common; > + > + /* > + * The zone append command has an extended in header. > + * The status field in zone_append_in_hdr must always > + * be the last byte. > + */ > + struct { > + __virtio64 sector; > + u8 status; > + } zone_append; > + } in_hdr; > + > + size_t in_hdr_len; > + > struct sg_table sg_table; > struct scatterlist sg[]; > }; > > -static inline blk_status_t virtblk_result(struct virtblk_req *vbr) > +static inline blk_status_t virtblk_result(u8 status) > { > - switch (vbr->status) { > + switch (status) { > case VIRTIO_BLK_S_OK: > return BLK_STS_OK; > case VIRTIO_BLK_S_UNSUPP: > return BLK_STS_NOTSUPP; > + case VIRTIO_BLK_S_ZONE_OPEN_RESOURCE: > + return BLK_STS_ZONE_OPEN_RESOURCE; > + case VIRTIO_BLK_S_ZONE_ACTIVE_RESOURCE: > + return BLK_STS_ZONE_ACTIVE_RESOURCE; > + case VIRTIO_BLK_S_IOERR: > + case VIRTIO_BLK_S_ZONE_UNALIGNED_WP: > default: > return BLK_STS_IOERR; > } > @@ -111,11 +141,11 @@ static inline struct virtio_blk_vq > *get_virtio_blk_vq(struct blk_mq_hw_ctx *hctx > > static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr) > { > - struct scatterlist hdr, status, *sgs[3]; > + struct scatterlist out_hdr, in_hdr, *sgs[3]; > unsigned int num_out = 0, num_in = 0; > > - sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); > - sgs[num_out++] = &hdr; > + sg_init_one(&out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); > + sgs[num_out++] = &out_hdr; > > if (vbr->sg_table.nents) { > if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, > VIRTIO_BLK_T_OUT)) > @@ -124,8 +154,8 @@ static int virtblk_add_req(struct virtqueue *vq, struct > virtblk_req *vbr) > sgs[num_out + num_in++] = vbr->sg_table.sgl; > } > > - sg_init_one(&status, &vbr->status, sizeof(vbr->status)); > - sgs[num_out + num_in++] = &status; > + sg_init_one(&in_hdr, &vbr->in_hdr.common.status, vbr->in_hdr_len); > + sgs[num_out + num_in++] = &in_hdr; > > return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); > } > @@ -214,21 +244,22 @@ static blk_status_t virtblk_setup_cmd(struct > virtio_device *vdev, > struct request *req, > struct virtblk_req *vbr) > { > + size_t in_hdr_len = sizeof(vbr->in_hdr.common.status); > bool unmap = false; > u32 type; > + u64 sector = 0; > > - vbr->out_hdr.sector = 0; > + /* Set fields for all request types */ > + vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); > > switch (req_op(req)) { > case REQ_OP_READ: > type = VIRTIO_BLK_T_IN; > - vbr->out_hdr.sector = cpu_to_virtio64(vdev, > - blk_rq_pos(req)); > + sector = blk_rq_pos(req); > break; > case REQ_OP_WRITE: > type = VIRTIO_BLK_T_OUT; > - vbr->out_hdr.sector = cpu_to_virtio64(vdev, > - blk_rq_pos(req)); > + sector = blk_rq_pos(req); > break; > case REQ_OP_FLUSH: > type = VIRTIO_BLK_T_FLUSH; > @@ -243,16 +274,42 @@ static blk_status_t virtblk_setup_cmd(struct > virtio_device *vdev, > case REQ_OP_SECURE_ERASE: > type = VIRTIO_BLK_T_SECURE_ERASE; > break; > - case REQ_OP_DRV_IN: > - type = VIRTIO_BLK_T_GET_ID; > + case REQ_OP_ZONE_OPEN: > + type = VIRTIO_BLK_T_ZONE_OPEN; > + sector = blk_rq_pos(req); > break; > + case REQ_OP_ZONE_CLOSE: > + type = VIRTIO_BLK_T_ZONE_CLOSE; > + sector = blk_rq_pos(req); > + break; > + case REQ_OP_ZONE_FINISH: > + type = VIRTIO_BLK_T_ZONE_FINISH; > + sector = blk_rq_pos(req); > + break; > + case REQ_OP_ZONE_APPEND: > + type = VIRTIO_BLK_T_ZONE_APPEND; > + sector = blk_rq_pos(req); > + in_hdr_len = sizeof(vbr->in_hdr.zone_append); > + break; > + case REQ_OP_ZONE_RESET: > + type = VIRTIO_BLK_T_ZONE_RESET; > + sector = blk_rq_pos(req); > + break; > + case REQ_OP_ZONE_RESET_ALL: > + type = VIRTIO_BLK_T_ZONE_RESET_ALL; > + break; > + case REQ_OP_DRV_IN: > + /* Out header already filled in, nothing to do */ > + return 0; > default: > WARN_ON_ONCE(1); > return BLK_STS_IOERR; > } > > + /* Set fields for non-REQ_OP_DRV_IN request types */ > + vbr->in_hdr_len = in_hdr_len; > vbr->out_hdr.type = cpu_to_virtio32(vdev, type); > - vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); > + vbr->out_hdr.sector = cpu_to_virtio64(vdev, sector); > > if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES || > type == VIRTIO_BLK_T_SECURE_ERASE) { > @@ -263,13 +320,30 @@ static blk_status_t virtblk_setup_cmd(struct > virtio_device *vdev, > return 0; > } > > +/* > + * The status byte is always the last byte of the virtblk request > + * in-header. This helper fetches its value for all in-header formats > + * that are currently defined. > + */ > +static inline u8 virtblk_vbr_status(struct virtblk_req *vbr) > +{ > + return *((u8 *)&vbr->in_hdr + vbr->in_hdr_len - 1); > +} > + > static inline void virtblk_request_done(struct request *req) > { > struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); > + blk_status_t status = virtblk_result(virtblk_vbr_status(vbr)); > + struct virtio_blk *vblk = req->mq_hctx->queue->queuedata; > > virtblk_unmap_data(req, vbr); > virtblk_cleanup_cmd(req); > - blk_mq_end_request(req, virtblk_result(vbr)); > + > + if (req_op(req) == REQ_OP_ZONE_APPEND) > + req->__sector = virtio64_to_cpu(vblk->vdev, > + vbr->in_hdr.zone_append.sector); > + > + blk_mq_end_request(req, status); > } > > static void virtblk_done(struct virtqueue *vq) > @@ -455,6 +529,315 @@ static void virtio_queue_rqs(struct request **rqlist) > *rqlist = requeue_list; > } > > +#ifdef CONFIG_BLK_DEV_ZONED > +static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk, > + unsigned int nr_zones, > + unsigned int zone_sectors, > + size_t *buflen) > +{ > + struct request_queue *q = vblk->disk->queue; > + size_t bufsize; > + void *buf; > + > + nr_zones = min_t(unsigned int, nr_zones, > + get_capacity(vblk->disk) >> ilog2(zone_sectors)); > + > + bufsize = sizeof(struct virtio_blk_zone_report) + > + nr_zones * sizeof(struct virtio_blk_zone_descriptor); > + bufsize = min_t(size_t, bufsize, > + queue_max_hw_sectors(q) << SECTOR_SHIFT); > + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); > + > + while (bufsize >= sizeof(struct virtio_blk_zone_report)) { > + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); > + if (buf) { > + *buflen = bufsize; > + return buf; > + } > + bufsize >>= 1; > + } > + > + return NULL; > +} > + > +static int virtblk_submit_zone_report(struct virtio_blk *vblk, > + char *report_buf, size_t report_len, > + sector_t sector) > +{ > + struct request_queue *q = vblk->disk->queue; > + struct request *req; > + struct virtblk_req *vbr; > + int err; > + > + req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0); > + if (IS_ERR(req)) > + return PTR_ERR(req); > + > + vbr = blk_mq_rq_to_pdu(req); > + vbr->in_hdr_len = sizeof(vbr->in_hdr.common.status); > + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, > VIRTIO_BLK_T_ZONE_REPORT); > + vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector); > + > + err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL); > + if (err) > + goto out; > + > + blk_execute_rq(req, false); > + err = blk_status_to_errno(virtblk_result(vbr->in_hdr.common.status)); > +out: > + blk_mq_free_request(req); > + return err; > +} > + > +static int virtblk_parse_zone(struct virtio_blk *vblk, > + struct virtio_blk_zone_descriptor *entry, > + unsigned int idx, unsigned int zone_sectors, > + report_zones_cb cb, void *data) > +{ > + struct blk_zone zone = { }; > + > + zone.start = virtio64_to_cpu(vblk->vdev, entry->z_start); > + zone.capacity = virtio64_to_cpu(vblk->vdev, entry->z_cap); > + > + switch (entry->z_type) { > + case VIRTIO_BLK_ZT_SWR: > + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ; > + break; > + case VIRTIO_BLK_ZT_SWP: > + zone.type = BLK_ZONE_TYPE_SEQWRITE_PREF; > + break; > + case VIRTIO_BLK_ZT_CONV: > + zone.type = BLK_ZONE_TYPE_CONVENTIONAL; > + break; > + default: > + dev_err(&vblk->vdev->dev, "zone %llu: invalid type %#x\n", > + zone.start, entry->z_type); > + return -EINVAL; > + } > + > + switch (entry->z_state) { > + case VIRTIO_BLK_ZS_EMPTY: > + zone.cond = BLK_ZONE_COND_EMPTY; > + break; > + case VIRTIO_BLK_ZS_CLOSED: > + zone.cond = BLK_ZONE_COND_CLOSED; > + break; > + case VIRTIO_BLK_ZS_FULL: > + zone.cond = BLK_ZONE_COND_FULL; > + break; > + case VIRTIO_BLK_ZS_EOPEN: > + zone.cond = BLK_ZONE_COND_EXP_OPEN; > + break; > + case VIRTIO_BLK_ZS_IOPEN: > + zone.cond = BLK_ZONE_COND_IMP_OPEN; > + break; > + case VIRTIO_BLK_ZS_NOT_WP: > + zone.cond = BLK_ZONE_COND_NOT_WP; > + break; > + case VIRTIO_BLK_ZS_RDONLY: > + zone.cond = BLK_ZONE_COND_READONLY; > + break; > + case VIRTIO_BLK_ZS_OFFLINE: > + zone.cond = BLK_ZONE_COND_OFFLINE; > + break; > + default: > + dev_err(&vblk->vdev->dev, "zone %llu: invalid condition %#x\n", > + zone.start, entry->z_state); > + return -EINVAL; > + } > + > + zone.len = zone_sectors; > + if (zone.cond == BLK_ZONE_COND_FULL) > + zone.wp = zone.start + zone.len;
Is this correct on devices where the last zone != zone_sectors? Maybe it
doesn't matter?
> + else
> + zone.wp = virtio64_to_cpu(vblk->vdev, entry->z_wp);
Is a sanity check needed here? The device is not trusted and must not be
able to trigger out-of-bounds accesses based on zone.wp or similar.
> +
> + return cb(&zone, idx, data);
> +}
> +
> +static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
> + unsigned int nr_zones, report_zones_cb cb,
> + void *data)
> +{
> + struct virtio_blk *vblk = disk->private_data;
> + struct virtio_blk_zone_report *report;
> + unsigned long long nz, i;
> + size_t buflen;
> + unsigned int zone_sectors = vblk->zone_sectors;
> + int ret, zone_idx = 0;
> +
> + if (WARN_ON_ONCE(!vblk->zone_sectors))
> + return -EOPNOTSUPP;
> +
> + report = virtblk_alloc_report_buffer(vblk, nr_zones,
> + zone_sectors, &buflen);
> + if (!report)
> + return -ENOMEM;
> +
> + while (zone_idx < nr_zones && sector < get_capacity(vblk->disk)) {
Why is zone_idx a signed int while nr_zones is an unsigned int?
> + memset(report, 0, buflen);
> +
> + ret = virtblk_submit_zone_report(vblk, (char *)report,
> + buflen, sector);
> + if (ret) {
> + if (ret > 0)
> + ret = -EIO;
> + goto out_free;
> + }
> + nz = min(virtio64_to_cpu(vblk->vdev, report->nr_zones),
> + (u64)nr_zones);
> + if (!nz)
> + break;
> +
> + for (i = 0; i < nz && zone_idx < nr_zones; i++) {
> + ret = virtblk_parse_zone(vblk, &report->zones[i],
> + zone_idx, zone_sectors, cb,
> data);
> + if (ret)
> + goto out_free;
> +
> + sector = virtio64_to_cpu(vblk->vdev,
> + report->zones[i].z_start) +
> + zone_sectors;
> + zone_idx++;
> + }
> + }
> +
> + if (zone_idx > 0)
> + ret = zone_idx;
> + else
> + ret = -EINVAL;
> +out_free:
> + kvfree(report);
> + return ret;
> +}
> +
> +static void virtblk_revalidate_zones(struct virtio_blk *vblk)
> +{
> + u8 model;
> +
> + if (!vblk->zone_sectors)
> + return;
> +
> + virtio_cread(vblk->vdev, struct virtio_blk_config,
> + zoned.model, &model);
model is unused? If used, then it needs to be validated like in
virtblk_probe_zoned_device().
> + if (!blk_revalidate_disk_zones(vblk->disk, NULL))
> + set_capacity_and_notify(vblk->disk, 0);
> +}
> +
> +static int virtblk_probe_zoned_device(struct virtio_device *vdev,
> + struct virtio_blk *vblk,
> + struct request_queue *q)
> +{
> + u32 v, wg;
> + u8 model;
> + int ret;
> +
> + virtio_cread(vdev, struct virtio_blk_config,
> + zoned.model, &model);
> +
> + switch (model) {
> + case VIRTIO_BLK_Z_NONE:
> + return 0;
> + case VIRTIO_BLK_Z_HM:
> + break;
> + case VIRTIO_BLK_Z_HA:
> + /*
> + * Present the host-aware device as a regular drive.
> + * TODO It is possible to add an option to make it appear
> + * in the system as a zoned drive.
> + */
> + return 0;
> + default:
> + dev_err(&vdev->dev, "unsupported zone model %d\n", model);
> + return -EINVAL;
> + }
> +
> + dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
> +
> + disk_set_zoned(vblk->disk, BLK_ZONED_HM);
> + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
> +
> + virtio_cread(vdev, struct virtio_blk_config,
> + zoned.max_open_zones, &v);
> + disk_set_max_open_zones(vblk->disk, v);
> +
> + dev_dbg(&vdev->dev, "max open zones = %u\n", v);
> +
> + virtio_cread(vdev, struct virtio_blk_config,
> + zoned.max_active_zones, &v);
> + disk_set_max_active_zones(vblk->disk, v);
> + dev_dbg(&vdev->dev, "max active zones = %u\n", v);
> +
> + virtio_cread(vdev, struct virtio_blk_config,
> + zoned.write_granularity, &wg);
> + if (!wg) {
> + dev_warn(&vdev->dev, "zero write granularity reported\n");
> + return -ENODEV;
> + }
> + blk_queue_physical_block_size(q, wg);
> + blk_queue_io_min(q, wg);
> +
> + dev_dbg(&vdev->dev, "write granularity = %u\n", wg);
> +
> + /*
> + * virtio ZBD specification doesn't require zones to be a power of
> + * two sectors in size, but the code in this driver expects that.
> + */
> + virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors,
> + &vblk->zone_sectors);
> + if (vblk->zone_sectors == 0 || !is_power_of_2(vblk->zone_sectors)) {
> + dev_err(&vdev->dev,
> + "zoned device with non power of two zone size %u\n",
> + vblk->zone_sectors);
> + return -ENODEV;
> + }
> + dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors);
> +
> + if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
> + dev_warn(&vblk->vdev->dev,
> + "ignoring negotiated F_DISCARD for zoned device\n");
> + blk_queue_max_discard_sectors(q, 0);
> + }
> +
> + ret = blk_revalidate_disk_zones(vblk->disk, NULL);
> + if (!ret) {
> + virtio_cread(vdev, struct virtio_blk_config,
> + zoned.max_append_sectors, &v);
> + if (!v) {
> + dev_warn(&vdev->dev, "zero max_append_sectors
> reported\n");
> + return -ENODEV;
> + }
> + if ((v << SECTOR_SHIFT) < wg) {
> + dev_err(&vdev->dev,
> + "write granularity %u exceeds
> max_append_sectors %u limit\n",
> + wg, v);
> + return -ENODEV;
> + }
> +
> + blk_queue_max_zone_append_sectors(q, v);
> + dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
> + }
> +
> + return ret;
> +}
> +
> +#else
> +
> +/*
> + * Zoned block device support is not configured in this kernel.
> + * We only need to define a few symbols to avoid compilation errors.
> + */
> +#define virtblk_report_zones NULL
> +static inline void virtblk_revalidate_zones(struct virtio_blk *vblk)
> +{
> +}
> +static inline int virtblk_probe_zoned_device(struct virtio_device *vdev,
> + struct virtio_blk *vblk, struct request_queue *q)
> +{
> + return -EOPNOTSUPP;
> +}
> +#endif /* CONFIG_BLK_DEV_ZONED */
> +
> /* return id (s/n) string for *disk to *id_str
> */
> static int virtblk_get_id(struct gendisk *disk, char *id_str)
> @@ -462,18 +845,24 @@ static int virtblk_get_id(struct gendisk *disk, char
> *id_str)
> struct virtio_blk *vblk = disk->private_data;
> struct request_queue *q = vblk->disk->queue;
> struct request *req;
> + struct virtblk_req *vbr;
> int err;
>
> req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
> if (IS_ERR(req))
> return PTR_ERR(req);
>
> + vbr = blk_mq_rq_to_pdu(req);
> + vbr->in_hdr_len = sizeof(vbr->in_hdr.common.status);
> + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
> + vbr->out_hdr.sector = 0;
> +
> err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
> if (err)
> goto out;
>
> blk_execute_rq(req, false);
> - err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
> + err = blk_status_to_errno(virtblk_result(vbr->in_hdr.common.status));
> out:
> blk_mq_free_request(req);
> return err;
> @@ -524,6 +913,7 @@ static const struct block_device_operations virtblk_fops
> = {
> .owner = THIS_MODULE,
> .getgeo = virtblk_getgeo,
> .free_disk = virtblk_free_disk,
> + .report_zones = virtblk_report_zones,
Does virtblk_report_zones() need:
mutex_lock(&vblk->vdev_mutex);
if (!vblk->vdev) {
ret = -ENXIO;
goto out;
}
...
mutex_unlock(&vblk->vdev_mutex);
return ret;
?
This is necessary when vblk->vdev is access by code that is reachable
after virtblk_remove() has been called (e.g. PCI hot unplug). In this
case the block device still exists and userspace processes may have file
descriptors open, but the underlying virtio-blk device is gone.
I think this may be necessary since ioctl(BLKREPORTZONE) can still be
called after virtblk_remove()?
See the vdev_mutex field doc comment for more information.
Stefan
signature.asc
Description: PGP signature
