On Tue, 2017-11-14 at 18:01 +0100, Jack Wang wrote:
> I suspect we run into same bug you were trying to fix in this patch
> set. we're running in v4.4.50
>
> I was trying to reproduce it, but no lucky yet, do you still have your
> reproducer?
Hello Jack,
I can reproduce this about every fifth run of test one of the srp-test
software and with the SRP initiator and target drivers of what will become
kernel v4.15-rc1 and by switching the ib_srpt driver from non-SRQ to SRQ
mode while the initiator is logging in. I'm currently analyzing where in the
block layer a queue run is missing. The patch below for the sd driver does
not fix the root cause but seems to help.
Bart.
Subject: [PATCH] Increase SCSI disk probing concurrency
---
drivers/scsi/scsi.c | 5 -----
drivers/scsi/scsi_pm.c | 6 ++++--
drivers/scsi/scsi_priv.h | 1 -
drivers/scsi/sd.c | 26 +++++++++++++++++++++-----
drivers/scsi/sd.h | 1 +
include/scsi/scsi_driver.h | 1 +
6 files changed, 27 insertions(+), 13 deletions(-)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index a7e4fba724b7..e6d69e647f6a 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -85,10 +85,6 @@ unsigned int scsi_logging_level;
EXPORT_SYMBOL(scsi_logging_level);
#endif
-/* sd, scsi core and power management need to coordinate flushing async
actions */
-ASYNC_DOMAIN(scsi_sd_probe_domain);
-EXPORT_SYMBOL(scsi_sd_probe_domain);
-
/*
* Separate domain (from scsi_sd_probe_domain) to maximize the benefit of
* asynchronous system resume operations. It is marked 'exclusive' to avoid
@@ -839,7 +835,6 @@ static void __exit exit_scsi(void)
scsi_exit_devinfo();
scsi_exit_procfs();
scsi_exit_queue();
- async_unregister_domain(&scsi_sd_probe_domain);
}
subsys_initcall(init_scsi);
diff --git a/drivers/scsi/scsi_pm.c b/drivers/scsi/scsi_pm.c
index b44c1bb687a2..d8e43c2f4d40 100644
--- a/drivers/scsi/scsi_pm.c
+++ b/drivers/scsi/scsi_pm.c
@@ -171,9 +171,11 @@ static int scsi_bus_resume_common(struct device *dev,
static int scsi_bus_prepare(struct device *dev)
{
if (scsi_is_sdev_device(dev)) {
- /* sd probing uses async_schedule. Wait until it finishes. */
- async_synchronize_full_domain(&scsi_sd_probe_domain);
+ struct scsi_driver *drv = to_scsi_driver(dev->driver);
+ /* sd probing happens asynchronously. Wait until it finishes. */
+ if (drv->sync)
+ drv->sync(dev);
} else if (scsi_is_host_device(dev)) {
/* Wait until async scanning is finished */
scsi_complete_async_scans();
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index dab29f538612..bf0cadf6a321 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -174,7 +174,6 @@ static inline void scsi_autopm_put_host(struct Scsi_Host
*h) {}
#endif /* CONFIG_PM */
extern struct async_domain scsi_sd_pm_domain;
-extern struct async_domain scsi_sd_probe_domain;
/* scsi_dh.c */
#ifdef CONFIG_SCSI_DH
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 0313486d85c8..c26dbb38b60c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -112,6 +112,7 @@ static void sd_shutdown(struct device *);
static int sd_suspend_system(struct device *);
static int sd_suspend_runtime(struct device *);
static int sd_resume(struct device *);
+static void sd_sync_probe_domain(struct device *dev);
static void sd_rescan(struct device *);
static int sd_init_command(struct scsi_cmnd *SCpnt);
static void sd_uninit_command(struct scsi_cmnd *SCpnt);
@@ -564,6 +565,7 @@ static struct scsi_driver sd_template = {
.shutdown = sd_shutdown,
.pm = &sd_pm_ops,
},
+ .sync = sd_sync_probe_domain,
.rescan = sd_rescan,
.init_command = sd_init_command,
.uninit_command = sd_uninit_command,
@@ -3221,9 +3223,9 @@ static int sd_format_disk_name(char *prefix, int index,
char *buf, int buflen)
/*
* The asynchronous part of sd_probe
*/
-static void sd_probe_async(void *data, async_cookie_t cookie)
+static void sd_probe_async(struct work_struct *work)
{
- struct scsi_disk *sdkp = data;
+ struct scsi_disk *sdkp = container_of(work, typeof(*sdkp), probe_work);
struct scsi_device *sdp;
struct gendisk *gd;
u32 index;
@@ -3326,6 +3328,8 @@ static int sd_probe(struct device *dev)
if (!sdkp)
goto out;
+ INIT_WORK(&sdkp->probe_work, sd_probe_async);
+
gd = alloc_disk(SD_MINORS);
if (!gd)
goto out_free;
@@ -3377,8 +3381,8 @@ static int sd_probe(struct device *dev)
get_device(dev);
dev_set_drvdata(dev, sdkp);
- get_device(&sdkp->dev); /* prevent release before async_schedule */
- async_schedule_domain(sd_probe_async, sdkp, &scsi_sd_probe_domain);
+ get_device(&sdkp->dev); /* prevent release before sd_probe_async() */
+ WARN_ON_ONCE(!queue_work(system_unbound_wq, &sdkp->probe_work));
return 0;
@@ -3395,6 +3399,18 @@ static int sd_probe(struct device *dev)
return error;
}
+static void sd_wait_for_probing(struct scsi_disk *sdkp)
+{
+ flush_work(&sdkp->probe_work);
+}
+
+static void sd_sync_probe_domain(struct device *dev)
+{
+ struct scsi_disk *sdkp = dev_get_drvdata(dev);
+
+ sd_wait_for_probing(sdkp);
+}
+
/**
* sd_remove - called whenever a scsi disk (previously recognized by
* sd_probe) is detached from the system. It is called (potentially
@@ -3416,7 +3432,7 @@ static int sd_remove(struct device *dev)
scsi_autopm_get_device(sdkp->device);
async_synchronize_full_domain(&scsi_sd_pm_domain);
- async_synchronize_full_domain(&scsi_sd_probe_domain);
+ sd_wait_for_probing(sdkp);
device_del(&sdkp->dev);
del_gendisk(sdkp->disk);
sd_shutdown(dev);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 7b57dafcd45a..2cc47183c9aa 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -81,6 +81,7 @@ struct scsi_disk {
unsigned int zones_optimal_nonseq;
unsigned int zones_max_open;
#endif
+ struct work_struct probe_work;
atomic_t openers;
sector_t capacity; /* size in logical blocks */
u32 max_xfer_blocks;
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index a5534ccad859..145d6239eecf 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -11,6 +11,7 @@ struct scsi_device;
struct scsi_driver {
struct device_driver gendrv;
+ void (*sync)(struct device *);
void (*rescan)(struct device *);
int (*init_command)(struct scsi_cmnd *);
void (*uninit_command)(struct scsi_cmnd *);
--
2.15.0