Re: [PATCH 3/4] scsi: improved eh timeout handler

Jörn Engel Thu, 06 Jun 2013 11:03:17 -0700

On Thu, 6 June 2013 11:43:54 +0200, Hannes Reinecke wrote:
> 
> When a command runs into a timeout we need to send an 'ABORT TASK'
> TMF. This is typically done by the 'eh_abort_handler' LLDD callback.
> 
> Conceptually, however, this function is a normal SCSI command, so
> there is no need to enter the error handler.
> 
> This patch implements a new scsi_abort_command() function which
> invokes an asynchronous function scsi_eh_abort_handler() to
> abort the commands via 'eh_abort_handler'.
> 
> If the 'eh_abort_handler' returns SUCCESS or FAST_IO_FAIL the
> command will be retried if possible. If no retries are allowed
> the command will be returned immediately, as we have to assume
> the TMF succeeded and the command is completed with the LLDD.
> For any other return code from 'eh_abort_handler' the command
> will be pushed onto the existing SCSI EH handler, or aborted
> with DID_TIME_OUT if that fails.
> 
> Signed-off-by: Hannes Reinecke <[email protected]>
> ---
>  drivers/scsi/scsi_error.c        | 79 
> ++++++++++++++++++++++++++++++++++++++++
>  drivers/scsi/scsi_scan.c         |  3 ++
>  drivers/scsi/scsi_transport_fc.c |  3 +-
>  include/scsi/scsi_cmnd.h         |  1 +
>  include/scsi/scsi_device.h       |  2 +
>  5 files changed, 87 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
> index 96b4bb6..0a6b21c 100644
> --- a/drivers/scsi/scsi_error.c
> +++ b/drivers/scsi/scsi_error.c
> @@ -55,6 +55,8 @@ static void scsi_eh_done(struct scsi_cmnd *scmd);
>  #define HOST_RESET_SETTLE_TIME  (10)
>  
>  static int scsi_eh_try_stu(struct scsi_cmnd *scmd);
> +static int scsi_try_to_abort_cmd(struct scsi_host_template *hostt,
> +                              struct scsi_cmnd *scmd);
>  
>  /* called with shost->host_lock held */
>  void scsi_eh_wakeup(struct Scsi_Host *shost)
> @@ -90,6 +92,83 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
>  EXPORT_SYMBOL_GPL(scsi_schedule_eh);
>  
>  /**
> + * scsi_eh_abort_handler - Handle command aborts
> + * @work:    sdev on which commands should be aborted.
> + */
> +void
> +scsi_eh_abort_handler(struct work_struct *work)
> +{
> +     struct scsi_device *sdev =
> +             container_of(work, struct scsi_device, abort_work);
> +     struct Scsi_Host *shost = sdev->host;
> +     struct scsi_cmnd *scmd, *tmp;
> +     unsigned long flags;
> +     int rtn;
> +
> +     spin_lock_irqsave(&sdev->list_lock, flags);
> +     list_for_each_entry_safe(scmd, tmp, &sdev->eh_abort_list, eh_entry) {
> +             list_del_init(&scmd->eh_entry);


The _init bit is not needed.  I prefer list_del, as the poisoning
sometimes helps catching bugs.

> +             spin_unlock_irqrestore(&sdev->list_lock, flags);
> +             SCSI_LOG_ERROR_RECOVERY(3,
> +                     scmd_printk(KERN_INFO, scmd,
> +                                 "aborting command %p\n", scmd));
> +             rtn = scsi_try_to_abort_cmd(shost->hostt, scmd);
> +             if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
> +                     if (((scmd->request->cmd_flags & REQ_FAILFAST_DEV) ||

Am I being stupid again or should this be negated?

> +                          (scmd->request->cmd_type == REQ_TYPE_BLOCK_PC)) &&
> +                         (++scmd->retries <= scmd->allowed)) {
> +                             SCSI_LOG_ERROR_RECOVERY(3,
> +                                     scmd_printk(KERN_WARNING, scmd,
> +                                                 "retry aborted command\n"));
> +
> +                             scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
> +                     } else {
> +                             SCSI_LOG_ERROR_RECOVERY(3,
> +                                     scmd_printk(KERN_WARNING, scmd,
> +                                                 "fast fail aborted 
> command\n"));
> +                             scmd->result |= DID_TRANSPORT_FAILFAST << 16;
> +                             scsi_finish_command(scmd);
> +                     }
> +             } else {
> +                     if (!scsi_eh_scmd_add(scmd, 0)) {
> +                             SCSI_LOG_ERROR_RECOVERY(3,
> +                                     scmd_printk(KERN_WARNING, scmd,
> +                                                 "terminate aborted 
> command\n"));
> +                             scmd->result |= DID_TIME_OUT << 16;
> +                             scsi_finish_command(scmd);
> +                     }
> +             }
> +             spin_lock_irqsave(&sdev->list_lock, flags);
> +     }
> +     spin_unlock_irqrestore(&sdev->list_lock, flags);
> +}
> +
> +/**
> + * scsi_abort_command - schedule a command abort
> + * @scmd:    scmd to abort.
> + *
> + * We only need to abort commands after a command timeout
> + */
> +void
> +scsi_abort_command(struct scsi_cmnd *scmd)
> +{
> +     unsigned long flags;
> +     int kick_worker = 0;
> +     struct scsi_device *sdev = scmd->device;
> +
> +     spin_lock_irqsave(&sdev->list_lock, flags);
> +     if (list_empty(&sdev->eh_abort_list))
> +             kick_worker = 1;
> +     list_add(&scmd->eh_entry, &sdev->eh_abort_list);
> +     SCSI_LOG_ERROR_RECOVERY(3,
> +             scmd_printk(KERN_INFO, scmd, "adding to eh_abort_list\n"));

The printk can be moved outside the spinlock.  Who knows, maybe this
will become a scalability bottleneck before the millenium is over.

> +     spin_unlock_irqrestore(&sdev->list_lock, flags);
> +     if (kick_worker)
> +             schedule_work(&sdev->abort_work);
> +}
> +EXPORT_SYMBOL_GPL(scsi_abort_command);
> +
> +/**
>   * scsi_eh_scmd_add - add scsi cmd to error handling.
>   * @scmd:    scmd to run eh on.
>   * @eh_flag: optional SCSI_EH flag.
> diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
> index 3e58b22..f9cc6fc 100644
> --- a/drivers/scsi/scsi_scan.c
> +++ b/drivers/scsi/scsi_scan.c
> @@ -231,6 +231,7 @@ static struct scsi_device *scsi_alloc_sdev(struct 
> scsi_target *starget,
>       struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
>       extern void scsi_evt_thread(struct work_struct *work);
>       extern void scsi_requeue_run_queue(struct work_struct *work);
> +     extern void scsi_eh_abort_handler(struct work_struct *work);

Function declarations in a .c file?  Ick!

>  
>       sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size,
>                      GFP_ATOMIC);
> @@ -251,9 +252,11 @@ static struct scsi_device *scsi_alloc_sdev(struct 
> scsi_target *starget,
>       INIT_LIST_HEAD(&sdev->cmd_list);
>       INIT_LIST_HEAD(&sdev->starved_entry);
>       INIT_LIST_HEAD(&sdev->event_list);
> +     INIT_LIST_HEAD(&sdev->eh_abort_list);
>       spin_lock_init(&sdev->list_lock);
>       INIT_WORK(&sdev->event_work, scsi_evt_thread);
>       INIT_WORK(&sdev->requeue_work, scsi_requeue_run_queue);
> +     INIT_WORK(&sdev->abort_work, scsi_eh_abort_handler);
>  
>       sdev->sdev_gendev.parent = get_device(&starget->dev);
>       sdev->sdev_target = starget;
> diff --git a/drivers/scsi/scsi_transport_fc.c 
> b/drivers/scsi/scsi_transport_fc.c
> index e106c27..09237bf 100644
> --- a/drivers/scsi/scsi_transport_fc.c
> +++ b/drivers/scsi/scsi_transport_fc.c
> @@ -2079,7 +2079,8 @@ fc_timed_out(struct scsi_cmnd *scmd)
>       if (rport->port_state == FC_PORTSTATE_BLOCKED)
>               return BLK_EH_RESET_TIMER;
>  
> -     return BLK_EH_NOT_HANDLED;
> +     scsi_abort_command(scmd);
> +     return BLK_EH_SCHEDULED;
>  }
>  
>  /*
> diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
> index de5f5d8..460e514 100644
> --- a/include/scsi/scsi_cmnd.h
> +++ b/include/scsi/scsi_cmnd.h
> @@ -144,6 +144,7 @@ extern void scsi_put_command(struct scsi_cmnd *);
>  extern void __scsi_put_command(struct Scsi_Host *, struct scsi_cmnd *,
>                              struct device *);
>  extern void scsi_finish_command(struct scsi_cmnd *cmd);
> +extern void scsi_abort_command(struct scsi_cmnd *cmd);
>  
>  extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
>                                size_t *offset, size_t *len);
> diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
> index cc64587..e03d379 100644
> --- a/include/scsi/scsi_device.h
> +++ b/include/scsi/scsi_device.h
> @@ -80,6 +80,7 @@ struct scsi_device {
>       spinlock_t list_lock;
>       struct list_head cmd_list;      /* queue of in use SCSI Command 
> structures */
>       struct list_head starved_entry;
> +     struct list_head eh_abort_list;
>       struct scsi_cmnd *current_cmnd; /* currently active command */
>       unsigned short queue_depth;     /* How deep of a queue we want */
>       unsigned short max_queue_depth; /* max queue depth */
> @@ -180,6 +181,7 @@ struct scsi_device {
>  
>       struct execute_work     ew; /* used to get process context on put */
>       struct work_struct      requeue_work;
> +     struct work_struct      abort_work;
>  
>       struct scsi_dh_data     *scsi_dh_data;
>       enum scsi_device_state sdev_state;
> -- 
> 1.7.12.4
> 

Jörn

--
You can't tell where a program is going to spend its time. Bottlenecks
occur in surprising places, so don't try to second guess and put in a
speed hack until you've proven that's where the bottleneck is.
-- Rob Pike
--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 3/4] scsi: improved eh timeout handler

Reply via email to