host_failed

Wei Fang Tue, 31 May 2016 01:37:23 -0700

sas_ata_strategy_handler() adds the works of the ata error handler
to system_unbound_wq. This workqueue asynchronously runs work items,
so the ata error handler will be performed concurrently on different
CPUs. In this case, ->host_failed will be decreased simultaneously in
scsi_eh_finish_cmd() on different CPUs, and become abnormal.


It will lead to permanently inequal between ->host_failed and
 ->host_busy, and scsi error handler thread won't become running.
IO errors after that won't be handled forever.

Use atomic type for ->host_failed to fix this race.

This fixes the problem introduced in
commit 50824d6c5657 ("[SCSI] libsas: async ata-eh").

Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Wei Fang <[email protected]>
---
 drivers/ata/libata-eh.c             |    2 +-
 drivers/scsi/libsas/sas_scsi_host.c |    5 +++--
 drivers/scsi/scsi.c                 |    2 +-
 drivers/scsi/scsi_error.c           |   15 +++++++++------
 drivers/scsi/scsi_lib.c             |    3 ++-
 include/scsi/scsi_host.h            |    2 +-
 6 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 961acc7..a0e7612 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -606,7 +606,7 @@ void ata_scsi_error(struct Scsi_Host *host)
        ata_scsi_port_error_handler(host, ap);
 
        /* finish or retry handled scmd's and clean up */
-       WARN_ON(host->host_failed || !list_empty(&eh_work_q));
+       WARN_ON(atomic_read(&host->host_failed) || !list_empty(&eh_work_q));
 
        DPRINTK("EXIT\n");
 }
diff --git a/drivers/scsi/libsas/sas_scsi_host.c 
b/drivers/scsi/libsas/sas_scsi_host.c
index 519dac4..8d74003 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -757,7 +757,8 @@ retry:
        spin_unlock_irq(shost->host_lock);
 
        SAS_DPRINTK("Enter %s busy: %d failed: %d\n",
-                   __func__, atomic_read(&shost->host_busy), 
shost->host_failed);
+                   __func__, atomic_read(&shost->host_busy),
+                   atomic_read(&shost->host_failed));
        /*
         * Deal with commands that still have SAS tasks (i.e. they didn't
         * complete via the normal sas_task completion mechanism),
@@ -800,7 +801,7 @@ out:
 
        SAS_DPRINTK("--- Exit %s: busy: %d failed: %d tries: %d\n",
                    __func__, atomic_read(&shost->host_busy),
-                   shost->host_failed, tries);
+                   atomic_read(&shost->host_failed), tries);
 }
 
 enum blk_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *cmd)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1deb6ad..7840915 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -527,7 +527,7 @@ void scsi_log_completion(struct scsi_cmnd *cmd, int 
disposition)
                                scmd_printk(KERN_INFO, cmd,
                                            "scsi host busy %d failed %d\n",
                                            
atomic_read(&cmd->device->host->host_busy),
-                                           cmd->device->host->host_failed);
+                                           
atomic_read(&cmd->device->host->host_failed));
                }
        }
 }
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 984ddcb..f37666f 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -62,7 +62,8 @@ static int scsi_try_to_abort_cmd(struct scsi_host_template *,
 /* called with shost->host_lock held */
 void scsi_eh_wakeup(struct Scsi_Host *shost)
 {
-       if (atomic_read(&shost->host_busy) == shost->host_failed) {
+       if (atomic_read(&shost->host_busy) ==
+           atomic_read(&shost->host_failed)) {
                trace_scsi_eh_wakeup(shost);
                wake_up_process(shost->ehandler);
                SCSI_LOG_ERROR_RECOVERY(5, shost_printk(KERN_INFO, shost,
@@ -250,7 +251,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
                eh_flag &= ~SCSI_EH_CANCEL_CMD;
        scmd->eh_eflags |= eh_flag;
        list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
-       shost->host_failed++;
+       atomic_inc(&shost->host_failed);
        scsi_eh_wakeup(shost);
  out_unlock:
        spin_unlock_irqrestore(shost->host_lock, flags);
@@ -1127,7 +1128,7 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int rtn)
  */
 void scsi_eh_finish_cmd(struct scsi_cmnd *scmd, struct list_head *done_q)
 {
-       scmd->device->host->host_failed--;
+       atomic_dec(&scmd->device->host->host_failed);
        scmd->eh_eflags = 0;
        list_move_tail(&scmd->eh_entry, done_q);
 }
@@ -2190,8 +2191,10 @@ int scsi_error_handler(void *data)
                if (kthread_should_stop())
                        break;
 
-               if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) 
||
-                   shost->host_failed != atomic_read(&shost->host_busy)) {
+               if ((atomic_read(&shost->host_failed) == 0 &&
+                    shost->host_eh_scheduled == 0) ||
+                   (atomic_read(&shost->host_failed) !=
+                    atomic_read(&shost->host_busy))) {
                        SCSI_LOG_ERROR_RECOVERY(1,
                                shost_printk(KERN_INFO, shost,
                                             "scsi_eh_%d: sleeping\n",
@@ -2205,7 +2208,7 @@ int scsi_error_handler(void *data)
                        shost_printk(KERN_INFO, shost,
                                     "scsi_eh_%d: waking up %d/%d/%d\n",
                                     shost->host_no, shost->host_eh_scheduled,
-                                    shost->host_failed,
+                                    atomic_read(&shost->host_failed),
                                     atomic_read(&shost->host_busy)));
 
                /*
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 8106515..fb3cc5d 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -318,7 +318,8 @@ void scsi_device_unbusy(struct scsi_device *sdev)
                atomic_dec(&starget->target_busy);
 
        if (unlikely(scsi_host_in_recovery(shost) &&
-                    (shost->host_failed || shost->host_eh_scheduled))) {
+                    (atomic_read(&shost->host_failed) ||
+                     shost->host_eh_scheduled))) {
                spin_lock_irqsave(shost->host_lock, flags);
                scsi_eh_wakeup(shost);
                spin_unlock_irqrestore(shost->host_lock, flags);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index fcfa3d7..654435f 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -576,7 +576,7 @@ struct Scsi_Host {
        atomic_t host_busy;                /* commands actually active on 
low-level */
        atomic_t host_blocked;
 
-       unsigned int host_failed;          /* commands that failed.
+       atomic_t host_failed;              /* commands that failed.
                                              protected by host_lock */
        unsigned int host_eh_scheduled;    /* EH scheduled without command */
     
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2 1/2] scsi: fix race between simultaneous decrements of ->host_failed

Reply via email to