On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote:
> No need to pre-allocate a big buffer for the IO SGL anymore. If a device
> has lots of deep queues, preallocation for the sg list can consume
> substantial amounts of memory. For a HW virtio-blk device, nr_hw_queues
> can be 64 or 128 and each queue's depth might be 128. This means the
> resulting preallocation for the data SGLs is big.
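
To put rough numbers on the saving (assuming a 64-bit build where
sizeof(struct scatterlist) is 32 bytes, and a device with sg_elems = 128):
128 queues * 128 queue depth * 128 entries * 32 B comes to ~64 MiB
preallocated per device, versus 128 * 128 * 2 * 32 B = 1 MiB with the
2-entry inline SGL introduced below.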
> 
> Switch to runtime allocation for SGL for lists longer than 2 entries.
> This is the approach used by NVMe drivers so it should be reasonable for
> virtio block as well. Runtime SGL allocation has always been the case
> for the legacy I/O path so this is nothing new.
> 
> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't
> support SG_CHAIN, use only runtime allocation for the SGL.
> 
> Re-organize the setup of the IO request to fit the new sg chain
> mechanism.
> 
> No performance degradation was seen (fio libaio engine with 16 jobs and
> 128 iodepth):
> 
> IO size   IOPs Rand Read (before/after)   IOPs Rand Write (before/after)
> -------   -----------------------------   ------------------------------
> 512B      318K/316K                       329K/325K
> 4KB       323K/321K                       353K/349K
> 16KB      199K/208K                       250K/275K
> 128KB     36K/36.1K                       39.2K/41.7K
> 
> Signed-off-by: Max Gurtovoy <mgurto...@nvidia.com>
> Reviewed-by: Israel Rukshin <isra...@nvidia.com>

Could you run enough repetitions to report confidence intervals?
As it stands, this looks like a 1-2% regression for 512B and 4KB.
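
For example, with ~10 runs per data point you could report
mean +/- 2*stddev/sqrt(n); if 512B random read then comes out as, say,
318K +/- 3K before vs 316K +/- 3K after (made-up numbers, just to
illustrate), the intervals overlap and the delta is noise; if not, it is
a real regression worth explaining.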



> ---
> 
> changes from V2:
>  - initialize vbr->out_hdr.sector during virtblk_setup_cmd
> 
> changes from V1:
>  - Kconfig update (from Christoph)
>  - Re-order cmd setup (from Christoph)
>  - use flexible sg pointer in the cmd (from Christoph)
>  - added perf numbers to commit msg (from Feng Li)
> 
> ---
>  drivers/block/Kconfig      |   1 +
>  drivers/block/virtio_blk.c | 155 +++++++++++++++++++++++--------------
>  2 files changed, 100 insertions(+), 56 deletions(-)
> 
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 63056cfd4b62..ca25a122b8ee 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -395,6 +395,7 @@ config XEN_BLKDEV_BACKEND
>  config VIRTIO_BLK
>       tristate "Virtio block driver"
>       depends on VIRTIO
> +     select SG_POOL
>       help
>         This is the virtual block driver for virtio.  It can be used with
>            QEMU based VMMs (like KVM or Xen).  Say Y or M.
> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> index 9332fc4e9b31..bdd6d415bd20 100644
> --- a/drivers/block/virtio_blk.c
> +++ b/drivers/block/virtio_blk.c
> @@ -24,6 +24,12 @@
>  /* The maximum number of sg elements that fit into a virtqueue */
>  #define VIRTIO_BLK_MAX_SG_ELEMS 32768
>  
> +#ifdef CONFIG_ARCH_NO_SG_CHAIN
> +#define VIRTIO_BLK_INLINE_SG_CNT     0
> +#else
> +#define VIRTIO_BLK_INLINE_SG_CNT     2
> +#endif
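
(Why 0 under ARCH_NO_SG_CHAIN: once a request needs more entries than the
inline chunk holds, sg_alloc_table_chained() has to chain from that chunk
into pool-allocated ones, which an arch without SG_CHAIN cannot do, so
the inline entries would be dead weight. An inline count of 0 makes every
allocation come from the sg_pool instead. nvme-pci defines
NVME_INLINE_SG_CNT the same way.)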
> +
>  static int virtblk_queue_count_set(const char *val,
>               const struct kernel_param *kp)
>  {
> @@ -93,6 +99,7 @@ struct virtio_blk {
>  struct virtblk_req {
>       struct virtio_blk_outhdr out_hdr;
>       u8 status;
> +     struct sg_table sg_table;
>       struct scatterlist sg[];
>  };
>  
> @@ -178,15 +185,94 @@ static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
>       return 0;
>  }
>  
> -static inline void virtblk_request_done(struct request *req)
> +static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr)
>  {
> -     struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
> +     if (blk_rq_nr_phys_segments(req))
> +             sg_free_table_chained(&vbr->sg_table,
> +                                   VIRTIO_BLK_INLINE_SG_CNT);
> +}
> +
> +static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req,
> +             struct virtblk_req *vbr)
> +{
> +     int err;
> +
> +     if (!blk_rq_nr_phys_segments(req))
> +             return 0;
> +
> +     vbr->sg_table.sgl = vbr->sg;
> +     err = sg_alloc_table_chained(&vbr->sg_table,
> +                                  blk_rq_nr_phys_segments(req),
> +                                  vbr->sg_table.sgl,
> +                                  VIRTIO_BLK_INLINE_SG_CNT);
> +     if (unlikely(err))
> +             return -ENOMEM;
>  
> +     return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl);
> +}
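
For readers following along: sg_alloc_table_chained() only touches the
sg_pool mempools when the request has more segments than the inline
count; with blk_rq_nr_phys_segments(req) <= VIRTIO_BLK_INLINE_SG_CNT the
preallocated vbr->sg[] is used as-is, so small IOs stay allocation-free
on the hot path. For reference, the signature in
include/linux/scatterlist.h is:

    int sg_alloc_table_chained(struct sg_table *table, int nents,
                               struct scatterlist *first_chunk,
                               unsigned nents_first_chunk);

Nit: sg_alloc_table_chained() already returns a negative errno on
failure, so a plain "return err;" would preserve it.
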
> +
> +static void virtblk_cleanup_cmd(struct request *req)
> +{
>       if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
>               kfree(page_address(req->special_vec.bv_page) +
>                     req->special_vec.bv_offset);
>       }
> +}
> +
> +static int virtblk_setup_cmd(struct virtio_device *vdev, struct request *req,
> +             struct virtblk_req *vbr)
> +{
> +     bool unmap = false;
> +     u32 type;
> +
> +     vbr->out_hdr.sector = 0;
> +
> +     switch (req_op(req)) {
> +     case REQ_OP_READ:
> +             type = VIRTIO_BLK_T_IN;
> +             vbr->out_hdr.sector = cpu_to_virtio64(vdev,
> +                                                   blk_rq_pos(req));
> +             break;
> +     case REQ_OP_WRITE:
> +             type = VIRTIO_BLK_T_OUT;
> +             vbr->out_hdr.sector = cpu_to_virtio64(vdev,
> +                                                   blk_rq_pos(req));
> +             break;
> +     case REQ_OP_FLUSH:
> +             type = VIRTIO_BLK_T_FLUSH;
> +             break;
> +     case REQ_OP_DISCARD:
> +             type = VIRTIO_BLK_T_DISCARD;
> +             break;
> +     case REQ_OP_WRITE_ZEROES:
> +             type = VIRTIO_BLK_T_WRITE_ZEROES;
> +             unmap = !(req->cmd_flags & REQ_NOUNMAP);
> +             break;
> +     case REQ_OP_DRV_IN:
> +             type = VIRTIO_BLK_T_GET_ID;
> +             break;
> +     default:
> +             WARN_ON_ONCE(1);
> +             return BLK_STS_IOERR;
> +     }
>  
> +     vbr->out_hdr.type = cpu_to_virtio32(vdev, type);
> +     vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
> +
> +     if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
> +             if (virtblk_setup_discard_write_zeroes(req, unmap))
> +                     return BLK_STS_RESOURCE;
> +     }
> +
> +     return 0;
> +}
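
One more nit while we are here: virtblk_setup_cmd() is declared int but
returns BLK_STS_IOERR/BLK_STS_RESOURCE, and virtio_queue_rq() passes the
int back as a blk_status_t. blk_status_t is __bitwise, so sparse will
warn about this. Something along these lines (untested sketch) would be
cleaner:

-static int virtblk_setup_cmd(struct virtio_device *vdev, struct request *req,
+static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
+		struct request *req,
 		struct virtblk_req *vbr)
...
-	return 0;
+	return BLK_STS_OK;

with a blk_status_t local in virtio_queue_rq() to match.
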
> +
> +static inline void virtblk_request_done(struct request *req)
> +{
> +     struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
> +
> +     virtblk_unmap_data(req, vbr);
> +     virtblk_cleanup_cmd(req);
>       blk_mq_end_request(req, virtblk_result(vbr));
>  }
>  
> @@ -244,57 +330,23 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
>       int qid = hctx->queue_num;
>       int err;
>       bool notify = false;
> -     bool unmap = false;
> -     u32 type;
>  
>       BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
>  
> -     switch (req_op(req)) {
> -     case REQ_OP_READ:
> -     case REQ_OP_WRITE:
> -             type = 0;
> -             break;
> -     case REQ_OP_FLUSH:
> -             type = VIRTIO_BLK_T_FLUSH;
> -             break;
> -     case REQ_OP_DISCARD:
> -             type = VIRTIO_BLK_T_DISCARD;
> -             break;
> -     case REQ_OP_WRITE_ZEROES:
> -             type = VIRTIO_BLK_T_WRITE_ZEROES;
> -             unmap = !(req->cmd_flags & REQ_NOUNMAP);
> -             break;
> -     case REQ_OP_DRV_IN:
> -             type = VIRTIO_BLK_T_GET_ID;
> -             break;
> -     default:
> -             WARN_ON_ONCE(1);
> -             return BLK_STS_IOERR;
> -     }
> -
> -     vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
> -     vbr->out_hdr.sector = type ?
> -             0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
> -     vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));
> +     err = virtblk_setup_cmd(vblk->vdev, req, vbr);
> +     if (unlikely(err))
> +             return err;
>  
>       blk_mq_start_request(req);
>  
> -     if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
> -             err = virtblk_setup_discard_write_zeroes(req, unmap);
> -             if (err)
> -                     return BLK_STS_RESOURCE;
> -     }
> -
> -     num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
> -     if (num) {
> -             if (rq_data_dir(req) == WRITE)
> -                     vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
> -             else
> -                     vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
> +     num = virtblk_map_data(hctx, req, vbr);
> +     if (unlikely(num < 0)) {
> +             virtblk_cleanup_cmd(req);
> +             return BLK_STS_RESOURCE;
>       }
>  
>       spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
> -     err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
> +     err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num);
>       if (err) {
>               virtqueue_kick(vblk->vqs[qid].vq);
>               /* Don't stop the queue if -ENOMEM: we may have failed to
> @@ -303,6 +355,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
>               if (err == -ENOSPC)
>                       blk_mq_stop_hw_queue(hctx);
>               spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
> +             virtblk_unmap_data(req, vbr);
> +             virtblk_cleanup_cmd(req);
>               switch (err) {
>               case -ENOSPC:
>                       return BLK_STS_DEV_RESOURCE;
> @@ -681,16 +735,6 @@ static const struct attribute_group *virtblk_attr_groups[] = {
>       NULL,
>  };
>  
> -static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
> -             unsigned int hctx_idx, unsigned int numa_node)
> -{
> -     struct virtio_blk *vblk = set->driver_data;
> -     struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
> -
> -     sg_init_table(vbr->sg, vblk->sg_elems);
> -     return 0;
> -}
> -
>  static int virtblk_map_queues(struct blk_mq_tag_set *set)
>  {
>       struct virtio_blk *vblk = set->driver_data;
> @@ -703,7 +747,6 @@ static const struct blk_mq_ops virtio_mq_ops = {
>       .queue_rq       = virtio_queue_rq,
>       .commit_rqs     = virtio_commit_rqs,
>       .complete       = virtblk_request_done,
> -     .init_request   = virtblk_init_request,
>       .map_queues     = virtblk_map_queues,
>  };
>  
> @@ -783,7 +826,7 @@ static int virtblk_probe(struct virtio_device *vdev)
>       vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
>       vblk->tag_set.cmd_size =
>               sizeof(struct virtblk_req) +
> -             sizeof(struct scatterlist) * sg_elems;
> +             sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
>       vblk->tag_set.driver_data = vblk;
>       vblk->tag_set.nr_hw_queues = vblk->num_vqs;
>  
> -- 
> 2.18.1
