sas_ata_strategy_handler() adds the works of the ata error handler
to system_unbound_wq. This workqueue asynchronously runs work items,
so the ata error handler will be performed concurrently on different
CPUs. In this case, ->host_failed will be decreased simultaneously in
scsi_eh_finish_cmd() on different CPUs, and become abnormal.
It will lead to permanently inequal between ->host_failed and
->host_busy, and scsi error handler thread won't become running.
IO errors after that won't be handled forever.
Use atomic type for ->host_failed to fix this race.
This fixes the problem introduced in
commit 50824d6c5657 ("[SCSI] libsas: async ata-eh").
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Wei Fang <[email protected]>
---
drivers/ata/libata-eh.c | 2 +-
drivers/scsi/libsas/sas_scsi_host.c | 5 +++--
drivers/scsi/scsi.c | 2 +-
drivers/scsi/scsi_error.c | 15 +++++++++------
drivers/scsi/scsi_lib.c | 3 ++-
include/scsi/scsi_host.h | 2 +-
6 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 961acc7..a0e7612 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -606,7 +606,7 @@ void ata_scsi_error(struct Scsi_Host *host)
ata_scsi_port_error_handler(host, ap);
/* finish or retry handled scmd's and clean up */
- WARN_ON(host->host_failed || !list_empty(&eh_work_q));
+ WARN_ON(atomic_read(&host->host_failed) || !list_empty(&eh_work_q));
DPRINTK("EXIT\n");
}
diff --git a/drivers/scsi/libsas/sas_scsi_host.c
b/drivers/scsi/libsas/sas_scsi_host.c
index 519dac4..8d74003 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -757,7 +757,8 @@ retry:
spin_unlock_irq(shost->host_lock);
SAS_DPRINTK("Enter %s busy: %d failed: %d\n",
- __func__, atomic_read(&shost->host_busy),
shost->host_failed);
+ __func__, atomic_read(&shost->host_busy),
+ atomic_read(&shost->host_failed));
/*
* Deal with commands that still have SAS tasks (i.e. they didn't
* complete via the normal sas_task completion mechanism),
@@ -800,7 +801,7 @@ out:
SAS_DPRINTK("--- Exit %s: busy: %d failed: %d tries: %d\n",
__func__, atomic_read(&shost->host_busy),
- shost->host_failed, tries);
+ atomic_read(&shost->host_failed), tries);
}
enum blk_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *cmd)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1deb6ad..7840915 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -527,7 +527,7 @@ void scsi_log_completion(struct scsi_cmnd *cmd, int
disposition)
scmd_printk(KERN_INFO, cmd,
"scsi host busy %d failed %d\n",
atomic_read(&cmd->device->host->host_busy),
- cmd->device->host->host_failed);
+
atomic_read(&cmd->device->host->host_failed));
}
}
}
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 984ddcb..f37666f 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -62,7 +62,8 @@ static int scsi_try_to_abort_cmd(struct scsi_host_template *,
/* called with shost->host_lock held */
void scsi_eh_wakeup(struct Scsi_Host *shost)
{
- if (atomic_read(&shost->host_busy) == shost->host_failed) {
+ if (atomic_read(&shost->host_busy) ==
+ atomic_read(&shost->host_failed)) {
trace_scsi_eh_wakeup(shost);
wake_up_process(shost->ehandler);
SCSI_LOG_ERROR_RECOVERY(5, shost_printk(KERN_INFO, shost,
@@ -250,7 +251,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
eh_flag &= ~SCSI_EH_CANCEL_CMD;
scmd->eh_eflags |= eh_flag;
list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
- shost->host_failed++;
+ atomic_inc(&shost->host_failed);
scsi_eh_wakeup(shost);
out_unlock:
spin_unlock_irqrestore(shost->host_lock, flags);
@@ -1127,7 +1128,7 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int rtn)
*/
void scsi_eh_finish_cmd(struct scsi_cmnd *scmd, struct list_head *done_q)
{
- scmd->device->host->host_failed--;
+ atomic_dec(&scmd->device->host->host_failed);
scmd->eh_eflags = 0;
list_move_tail(&scmd->eh_entry, done_q);
}
@@ -2190,8 +2191,10 @@ int scsi_error_handler(void *data)
if (kthread_should_stop())
break;
- if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0)
||
- shost->host_failed != atomic_read(&shost->host_busy)) {
+ if ((atomic_read(&shost->host_failed) == 0 &&
+ shost->host_eh_scheduled == 0) ||
+ (atomic_read(&shost->host_failed) !=
+ atomic_read(&shost->host_busy))) {
SCSI_LOG_ERROR_RECOVERY(1,
shost_printk(KERN_INFO, shost,
"scsi_eh_%d: sleeping\n",
@@ -2205,7 +2208,7 @@ int scsi_error_handler(void *data)
shost_printk(KERN_INFO, shost,
"scsi_eh_%d: waking up %d/%d/%d\n",
shost->host_no, shost->host_eh_scheduled,
- shost->host_failed,
+ atomic_read(&shost->host_failed),
atomic_read(&shost->host_busy)));
/*
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 8106515..fb3cc5d 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -318,7 +318,8 @@ void scsi_device_unbusy(struct scsi_device *sdev)
atomic_dec(&starget->target_busy);
if (unlikely(scsi_host_in_recovery(shost) &&
- (shost->host_failed || shost->host_eh_scheduled))) {
+ (atomic_read(&shost->host_failed) ||
+ shost->host_eh_scheduled))) {
spin_lock_irqsave(shost->host_lock, flags);
scsi_eh_wakeup(shost);
spin_unlock_irqrestore(shost->host_lock, flags);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index fcfa3d7..654435f 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -576,7 +576,7 @@ struct Scsi_Host {
atomic_t host_busy; /* commands actually active on
low-level */
atomic_t host_blocked;
- unsigned int host_failed; /* commands that failed.
+ atomic_t host_failed; /* commands that failed.
protected by host_lock */
unsigned int host_eh_scheduled; /* EH scheduled without command */
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html