On Tue, 2017-11-14 at 18:01 +0100, Jack Wang wrote:
> I suspect we have run into the same bug you were trying to fix in this
> patch set. We're running v4.4.50.
> 
> I was trying to reproduce it, but no luck yet. Do you still have your
> reproducer?

Hello Jack,

I can reproduce this about every fifth run of test one of the srp-test
software, using the SRP initiator and target drivers of what will become
kernel v4.15-rc1 and switching the ib_srpt driver from non-SRQ to SRQ
mode while the initiator is logging in. I'm currently analyzing where in the
block layer a queue run is missing. The patch below for the sd driver does
not fix the root cause but seems to help.

Bart.


Subject: [PATCH] Increase SCSI disk probing concurrency

---
 drivers/scsi/scsi.c        |  5 -----
 drivers/scsi/scsi_pm.c     |  6 ++++--
 drivers/scsi/scsi_priv.h   |  1 -
 drivers/scsi/sd.c          | 26 +++++++++++++++++++++-----
 drivers/scsi/sd.h          |  1 +
 include/scsi/scsi_driver.h |  1 +
 6 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index a7e4fba724b7..e6d69e647f6a 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -85,10 +85,6 @@ unsigned int scsi_logging_level;
 EXPORT_SYMBOL(scsi_logging_level);
 #endif
 
-/* sd, scsi core and power management need to coordinate flushing async actions */
-ASYNC_DOMAIN(scsi_sd_probe_domain);
-EXPORT_SYMBOL(scsi_sd_probe_domain);
-
 /*
  * Separate domain (from scsi_sd_probe_domain) to maximize the benefit of
  * asynchronous system resume operations.  It is marked 'exclusive' to avoid
@@ -839,7 +835,6 @@ static void __exit exit_scsi(void)
        scsi_exit_devinfo();
        scsi_exit_procfs();
        scsi_exit_queue();
-       async_unregister_domain(&scsi_sd_probe_domain);
 }
 
 subsys_initcall(init_scsi);
diff --git a/drivers/scsi/scsi_pm.c b/drivers/scsi/scsi_pm.c
index b44c1bb687a2..d8e43c2f4d40 100644
--- a/drivers/scsi/scsi_pm.c
+++ b/drivers/scsi/scsi_pm.c
@@ -171,9 +171,11 @@ static int scsi_bus_resume_common(struct device *dev,
 static int scsi_bus_prepare(struct device *dev)
 {
        if (scsi_is_sdev_device(dev)) {
-               /* sd probing uses async_schedule.  Wait until it finishes. */
-               async_synchronize_full_domain(&scsi_sd_probe_domain);
+               struct scsi_driver *drv = to_scsi_driver(dev->driver);
 
+               /* sd probing happens asynchronously. Wait until it finishes. */
+               if (drv->sync)
+                       drv->sync(dev);
        } else if (scsi_is_host_device(dev)) {
                /* Wait until async scanning is finished */
                scsi_complete_async_scans();
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index dab29f538612..bf0cadf6a321 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -174,7 +174,6 @@ static inline void scsi_autopm_put_host(struct Scsi_Host *h) {}
 #endif /* CONFIG_PM */
 
 extern struct async_domain scsi_sd_pm_domain;
-extern struct async_domain scsi_sd_probe_domain;
 
 /* scsi_dh.c */
 #ifdef CONFIG_SCSI_DH
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 0313486d85c8..c26dbb38b60c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -112,6 +112,7 @@ static void sd_shutdown(struct device *);
 static int sd_suspend_system(struct device *);
 static int sd_suspend_runtime(struct device *);
 static int sd_resume(struct device *);
+static void sd_sync_probe_domain(struct device *dev);
 static void sd_rescan(struct device *);
 static int sd_init_command(struct scsi_cmnd *SCpnt);
 static void sd_uninit_command(struct scsi_cmnd *SCpnt);
@@ -564,6 +565,7 @@ static struct scsi_driver sd_template = {
                .shutdown       = sd_shutdown,
                .pm             = &sd_pm_ops,
        },
+       .sync                   = sd_sync_probe_domain,
        .rescan                 = sd_rescan,
        .init_command           = sd_init_command,
        .uninit_command         = sd_uninit_command,
@@ -3221,9 +3223,9 @@ static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen)
 /*
  * The asynchronous part of sd_probe
  */
-static void sd_probe_async(void *data, async_cookie_t cookie)
+static void sd_probe_async(struct work_struct *work)
 {
-       struct scsi_disk *sdkp = data;
+       struct scsi_disk *sdkp = container_of(work, typeof(*sdkp), probe_work);
        struct scsi_device *sdp;
        struct gendisk *gd;
        u32 index;
@@ -3326,6 +3328,8 @@ static int sd_probe(struct device *dev)
        if (!sdkp)
                goto out;
 
+       INIT_WORK(&sdkp->probe_work, sd_probe_async);
+
        gd = alloc_disk(SD_MINORS);
        if (!gd)
                goto out_free;
@@ -3377,8 +3381,8 @@ static int sd_probe(struct device *dev)
        get_device(dev);
        dev_set_drvdata(dev, sdkp);
 
-       get_device(&sdkp->dev); /* prevent release before async_schedule */
-       async_schedule_domain(sd_probe_async, sdkp, &scsi_sd_probe_domain);
+       get_device(&sdkp->dev); /* prevent release before sd_probe_async() */
+       WARN_ON_ONCE(!queue_work(system_unbound_wq, &sdkp->probe_work));
 
        return 0;
 
@@ -3395,6 +3399,18 @@ static int sd_probe(struct device *dev)
        return error;
 }
 
+static void sd_wait_for_probing(struct scsi_disk *sdkp)
+{
+       flush_work(&sdkp->probe_work);
+}
+
+static void sd_sync_probe_domain(struct device *dev)
+{
+       struct scsi_disk *sdkp = dev_get_drvdata(dev);
+
+       sd_wait_for_probing(sdkp);
+}
+
 /**
  *     sd_remove - called whenever a scsi disk (previously recognized by
  *     sd_probe) is detached from the system. It is called (potentially
@@ -3416,7 +3432,7 @@ static int sd_remove(struct device *dev)
        scsi_autopm_get_device(sdkp->device);
 
        async_synchronize_full_domain(&scsi_sd_pm_domain);
-       async_synchronize_full_domain(&scsi_sd_probe_domain);
+       sd_wait_for_probing(sdkp);
        device_del(&sdkp->dev);
        del_gendisk(sdkp->disk);
        sd_shutdown(dev);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 7b57dafcd45a..2cc47183c9aa 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -81,6 +81,7 @@ struct scsi_disk {
        unsigned int    zones_optimal_nonseq;
        unsigned int    zones_max_open;
 #endif
+       struct work_struct probe_work;
        atomic_t        openers;
        sector_t        capacity;       /* size in logical blocks */
        u32             max_xfer_blocks;
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index a5534ccad859..145d6239eecf 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -11,6 +11,7 @@ struct scsi_device;
 struct scsi_driver {
        struct device_driver    gendrv;
 
+       void (*sync)(struct device *);
        void (*rescan)(struct device *);
        int (*init_command)(struct scsi_cmnd *);
        void (*uninit_command)(struct scsi_cmnd *);
-- 
2.15.0

Reply via email to