01_scsi_timer_eh_timer_fix.patch

        scmd->eh_timeout is used to resolve the race between command
        completion and timeout.  However, during error handling,
        scsi_send_eh_cmnd uses scmd->eh_timeout.  This creates a race
        condition between eh and normal completion for a request which
        has timed out and in the process of error handling.  If the
        request completes while scmd->eh_timeout is being used by eh,
        eh timeout is lost and the command will be handled by both eh
        and completion path.  This patch fixes the race by making
        scsi_send_eh_cmnd() use its own timer.

        This patch adds shost->eh_timeout field.  The name of the
        field equals scmd->eh_timeout which is used for normal command
        timeout.  As this can be confusing, renaming scmd->eh_timeout
        to something like scmd->cmd_timeout would be good.

        Reworked such that timeout race window is kept at minimal
        level as pointed out by James Bottomley.

Signed-off-by: Tejun Heo <[EMAIL PROTECTED]>

 drivers/scsi/scsi_error.c |   26 ++++++++++++++++++++------
 include/scsi/scsi_host.h  |    1 +
 2 files changed, 21 insertions(+), 6 deletions(-)

Index: scsi-reqfn-export/drivers/scsi/scsi_error.c
===================================================================
--- scsi-reqfn-export.orig/drivers/scsi/scsi_error.c    2005-04-19 
23:28:33.000000000 +0900
+++ scsi-reqfn-export/drivers/scsi/scsi_error.c 2005-04-19 23:30:57.000000000 
+0900
@@ -428,8 +428,10 @@ static int scsi_eh_completed_normally(st
  *    for some action to complete on the device.  our only job is to
  *    record that it timed out, and to wake up the thread.
  **/
-static void scsi_eh_times_out(struct scsi_cmnd *scmd)
+static void scsi_eh_times_out(unsigned long arg)
 {
+       struct scsi_cmnd *scmd = (void *)arg;
+
        scsi_eh_eflags_set(scmd, SCSI_EH_REC_TIMEOUT);
        SCSI_LOG_ERROR_RECOVERY(3, printk("%s: scmd:%p\n", __FUNCTION__,
                                          scmd));
@@ -448,9 +450,11 @@ static void scsi_eh_done(struct scsi_cmn
         * if the timeout handler is already running, then just set the
         * flag which says we finished late, and return.  we have no
         * way of stopping the timeout handler from running, so we must
-        * always defer to it.
+        * always defer to it.  note that by doing timer removal here
+        * the window in which normally completed commands are treated
+        * as timed out is kept at minimal level.
         */
-       if (del_timer(&scmd->eh_timeout)) {
+       if (del_timer(scmd->device->host->eh_timeout)) {
                scmd->request->rq_status = RQ_SCSI_DONE;
                scmd->owner = SCSI_OWNER_ERROR_HANDLER;
 
@@ -478,6 +482,7 @@ static int scsi_send_eh_cmnd(struct scsi
 {
        struct scsi_device *sdev = scmd->device;
        struct Scsi_Host *shost = sdev->host;
+       struct timer_list timer;
        DECLARE_MUTEX_LOCKED(sem);
        unsigned long flags;
        int rtn = SUCCESS;
@@ -492,7 +497,15 @@ static int scsi_send_eh_cmnd(struct scsi
                scmd->cmnd[1] = (scmd->cmnd[1] & 0x1f) |
                        (sdev->lun << 5 & 0xe0);
 
-       scsi_add_timer(scmd, timeout, scsi_eh_times_out);
+       /*
+        * set up eh timer.
+        */
+       shost->eh_timeout = &timer;
+       init_timer(&timer);
+       timer.expires = jiffies + timeout;
+       timer.function = scsi_eh_times_out;
+       timer.data = (unsigned long)scmd;
+       add_timer(&timer);
 
        /*
         * set up the semaphore so we wait for the command to complete.
@@ -508,8 +521,6 @@ static int scsi_send_eh_cmnd(struct scsi
        down(&sem);
        scsi_log_completion(scmd, SUCCESS);
 
-       shost->eh_action = NULL;
-
        /*
         * see if timeout.  if so, tell the host to forget about it.
         * in other words, we don't want a callback any more.
@@ -539,6 +550,9 @@ static int scsi_send_eh_cmnd(struct scsi
                rtn = FAILED;
        }
 
+       shost->eh_action = NULL;
+       shost->eh_timeout = NULL;
+
        SCSI_LOG_ERROR_RECOVERY(3, printk("%s: scmd: %p, rtn:%x\n",
                                          __FUNCTION__, scmd, rtn));
 
Index: scsi-reqfn-export/include/scsi/scsi_host.h
===================================================================
--- scsi-reqfn-export.orig/include/scsi/scsi_host.h     2005-04-19 
23:28:33.000000000 +0900
+++ scsi-reqfn-export/include/scsi/scsi_host.h  2005-04-19 23:30:57.000000000 
+0900
@@ -442,6 +442,7 @@ struct Scsi_Host {
        struct completion     * eh_notify; /* wait for eh to begin or end */
        struct semaphore      * eh_action; /* Wait for specific actions on the
                                           host. */
+       struct timer_list     * eh_timeout;  /* Used to timeout eh commands */
        unsigned int            eh_active:1; /* Indicates the eh thread is 
awake and active if
                                           this is true. */
        unsigned int            eh_kill:1; /* set when killing the eh thread */

-
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to