On 02/03/2026 03:21, Benjamin Marzinski wrote:
On Wed, Feb 25, 2026 at 03:36:10PM +0000, John Garry wrote:
For failover handling, we must resubmit each bio.

However, unlike NVMe, for SCSI there is no guarantee that any bio submitted
is either all or none completed.

As such, for SCSI failover handling, we will take the approach of just
re-submitting the original bio. For this, clone and submit each bio.

Signed-off-by: John Garry <[email protected]>
---
  drivers/scsi/scsi_multipath.c | 51 ++++++++++++++++++++++++++++++++++-
  include/scsi/scsi_multipath.h |  1 +
  2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_multipath.c b/drivers/scsi/scsi_multipath.c
index 4b7984e7e74ba..d79a92ec0cf6c 100644
--- a/drivers/scsi/scsi_multipath.c
+++ b/drivers/scsi/scsi_multipath.c
@@ -89,6 +89,14 @@ module_param_call(iopolicy, scsi_set_iopolicy, scsi_get_iopolicy,
  MODULE_PARM_DESC(iopolicy,
        "Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
+struct scsi_mpath_clone_bio {
+       struct bio              *master_bio;
+       struct bio              clone;
+};

If the only extra information you need for your clone bios is a pointer
to the original bio, I think you can just store that in bi_private. So
you shouldn't actually need to allocate any front pad for your bioset.

Yes, seems a decent idea


+
+#define scsi_mpath_to_master_bio(clone) \
+               container_of(clone, struct scsi_mpath_clone_bio, clone)
+
  static int scsi_mpath_unique_lun_id(struct scsi_device *sdev)
  {
        struct scsi_mpath_device *scsi_mpath_dev = sdev->scsi_mpath_dev;

@@ -260,6 +269,39 @@ static int scsi_multipath_sdev_init(struct scsi_device *sdev)
        return 0;
  }
+static void scsi_mpath_clone_end_io(struct bio *clone)
+{
+       struct scsi_mpath_clone_bio *scsi_mpath_clone_bio =
+                       scsi_mpath_to_master_bio(clone);
+       struct bio *master_bio = scsi_mpath_clone_bio->master_bio;
+
+       master_bio->bi_status = clone->bi_status;
+       bio_put(clone);
+       bio_endio(master_bio);
+}
+
+static struct bio *scsi_mpath_clone_bio(struct bio *bio)
+{
+       struct mpath_disk *mpath_disk = bio->bi_bdev->bd_disk->private_data;
+       struct mpath_head *mpath_head = mpath_disk->mpath_head;
+       struct scsi_mpath_clone_bio *scsi_mpath_clone_bio;
+       struct scsi_mpath_head *scsi_mpath_head = mpath_head->drvdata;
+       struct bio *clone;
+
+       clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOWAIT,
+                               &scsi_mpath_head->bio_pool);

Why use GFP_NOWAIT? It's more likely to fail than GFP_NOIO. If the bio
has REQ_NOWAIT set, I can see where you would need this, but otherwise,
I don't see why GFP_NOIO wouldn't be better here.

Seems reasonable to try GFP_NOIO. Furthermore, we really can't tolerate the clone failing. So, if it does fail, we should return an error pointer here and mpath_bdev_submit_bio() should error the original bio.


+       if (!clone)
+               return NULL;
+
+       clone->bi_end_io = scsi_mpath_clone_end_io;
+
+       scsi_mpath_clone_bio = container_of(clone,
+                                       struct scsi_mpath_clone_bio, clone);
+       scsi_mpath_clone_bio->master_bio = bio;
+
+       return clone;
+}
+
  static enum mpath_iopolicy_e scsi_mpath_get_iopolicy(struct mpath_head *mpath_head)
  {
        struct scsi_mpath_head *scsi_mpath_head = mpath_head->drvdata;
@@ -269,6 +311,7 @@ static enum mpath_iopolicy_e scsi_mpath_get_iopolicy(struct mpath_head *mpath_he
struct mpath_head_template smpdt_pr = {
        .get_iopolicy = scsi_mpath_get_iopolicy,
+       .clone_bio = scsi_mpath_clone_bio,
  };
static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
@@ -283,9 +326,13 @@ static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
        ida_init(&scsi_mpath_head->ida);
        mutex_init(&scsi_mpath_head->lock);
+       if (bioset_init(&scsi_mpath_head->bio_pool, SCSI_MAX_QUEUE_DEPTH,
+                       offsetof(struct scsi_mpath_clone_bio, clone),
+                       BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE))

You don't need 4096 cached bios to guarantee forward progress. I don't
see why BIO_POOL_SIZE won't work fine here.

Every bio which we are sent is cloned, and SCSI_MAX_QUEUE_DEPTH is used as the number of bios reserved in the pool - wouldn't it make sense to cache more than 2 bios?

Also, since you are cloning
bios, they are sharing the original bio's iovecs, so you don't need
BIOSET_NEED_BVECS.


ok

thanks!

Reply via email to