After a transport layer error has occurred, periodically try to reconnect
to the target until the dev_loss timer expires.  Protect the
callback functions that can be invoked from inside the SCSI EH
against concurrent invocation with srp_reconnect_rport() via the
rport mutex. Change the default dev_loss_tmo from 60s to 600s
to give the reconnect mechanism a chance to kick in.

Signed-off-by: Bart Van Assche <[email protected]>
Acked-by: David Dillow <[email protected]>
Cc: Roland Dreier <[email protected]>
Cc: Vu Pham <[email protected]>
Cc: Sebastian Riemer <[email protected]>
---
 Documentation/ABI/stable/sysfs-transport-srp |  8 +++++
 drivers/infiniband/ulp/srp/ib_srp.c          | 52 ++++++++++++++++++++++++----
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/Documentation/ABI/stable/sysfs-transport-srp b/Documentation/ABI/stable/sysfs-transport-srp
index 178a44d..21bd480 100644
--- a/Documentation/ABI/stable/sysfs-transport-srp
+++ b/Documentation/ABI/stable/sysfs-transport-srp
@@ -30,6 +30,14 @@ Contact:     [email protected]
 Description:   16-byte local SRP port identifier in hexadecimal format. An
                example: 4c:49:4e:55:58:20:56:49:4f:00:00:00:00:00:00:00.
 
+What:          /sys/class/srp_remote_ports/port-<h>:<n>/reconnect_delay
+Date:          December 1, 2013
+KernelVersion: 3.12
+Contact:       [email protected], [email protected]
+Description:   Number of seconds the SCSI layer will wait after a reconnect
+               attempt failed before retrying. Setting this attribute to
+               "off" will disable time-based reconnecting.
+
 What:          /sys/class/srp_remote_ports/port-<h>:<n>/roles
 Date:          June 27, 2007
 KernelVersion: 2.6.24
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index ceb84b6..4be3eb8 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -88,6 +88,11 @@ MODULE_PARM_DESC(topspin_workarounds,
 
 static struct kernel_param_ops srp_tmo_ops;
 
+static int srp_reconnect_delay = 10;
+module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay,
+               S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts");
+
 static int srp_fast_io_fail_tmo = 15;
 module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo,
                S_IRUGO | S_IWUSR);
@@ -96,7 +101,7 @@ MODULE_PARM_DESC(fast_io_fail_tmo,
                 " layer error and failing all I/O. \"off\" means that this"
                 " functionality is disabled.");
 
-static int srp_dev_loss_tmo = 60;
+static int srp_dev_loss_tmo = 600;
 module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo,
                S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(dev_loss_tmo,
@@ -144,10 +149,14 @@ static int srp_tmo_set(const char *val, const struct kernel_param *kp)
        } else {
                tmo = -1;
        }
-       if (kp->arg == &srp_fast_io_fail_tmo)
-               res = srp_tmo_valid(tmo, srp_dev_loss_tmo);
+       if (kp->arg == &srp_reconnect_delay)
+               res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo,
+                                   srp_dev_loss_tmo);
+       else if (kp->arg == &srp_fast_io_fail_tmo)
+               res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo);
        else
-               res = srp_tmo_valid(srp_fast_io_fail_tmo, tmo);
+               res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo,
+                                   tmo);
        if (res)
                goto out;
        *(int *)kp->arg = tmo;
@@ -1426,18 +1435,29 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr)
 static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 {
        struct srp_target_port *target = host_to_target(shost);
+       struct srp_rport *rport = target->rport;
        struct srp_request *req;
        struct srp_iu *iu;
        struct srp_cmd *cmd;
        struct ib_device *dev;
        unsigned long flags;
        int len, result;
+       const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler;
+
+       /*
+        * The SCSI EH thread is the only context from which srp_queuecommand()
+        * can get invoked for blocked devices (SDEV_BLOCK /
+        * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by
+        * locking the rport mutex if invoked from inside the SCSI EH.
+        */
+       if (in_scsi_eh)
+               mutex_lock(&rport->mutex);
 
        result = srp_chkready(target->rport);
        if (unlikely(result)) {
                scmnd->result = result;
                scmnd->scsi_done(scmnd);
-               return 0;
+               goto unlock_rport;
        }
 
        spin_lock_irqsave(&target->lock, flags);
@@ -1482,6 +1502,10 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
                goto err_unmap;
        }
 
+unlock_rport:
+       if (in_scsi_eh)
+               mutex_unlock(&rport->mutex);
+
        return 0;
 
 err_unmap:
@@ -1496,6 +1520,9 @@ err_iu:
 err_unlock:
        spin_unlock_irqrestore(&target->lock, flags);
 
+       if (in_scsi_eh)
+               mutex_unlock(&rport->mutex);
+
        return SCSI_MLQUEUE_HOST_BUSY;
 }
 
@@ -1780,6 +1807,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 static int srp_send_tsk_mgmt(struct srp_target_port *target,
                             u64 req_tag, unsigned int lun, u8 func)
 {
+       struct srp_rport *rport = target->rport;
        struct ib_device *dev = target->srp_host->srp_dev->dev;
        struct srp_iu *iu;
        struct srp_tsk_mgmt *tsk_mgmt;
@@ -1789,12 +1817,20 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target,
 
        init_completion(&target->tsk_mgmt_done);
 
+       /*
+        * Lock the rport mutex to avoid that srp_create_target_ib() is
+        * invoked while a task management function is being sent.
+        */
+       mutex_lock(&rport->mutex);
        spin_lock_irq(&target->lock);
        iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT);
        spin_unlock_irq(&target->lock);
 
-       if (!iu)
+       if (!iu) {
+               mutex_unlock(&rport->mutex);
+
                return -1;
+       }
 
        ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt,
                                   DMA_TO_DEVICE);
@@ -1811,8 +1847,11 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target,
                                      DMA_TO_DEVICE);
        if (srp_post_send(target, iu, sizeof *tsk_mgmt)) {
                srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT);
+               mutex_unlock(&rport->mutex);
+
                return -1;
        }
+       mutex_unlock(&rport->mutex);
 
        if (!wait_for_completion_timeout(&target->tsk_mgmt_done,
                                         msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)))
@@ -2713,6 +2752,7 @@ static void srp_remove_one(struct ib_device *device)
 static struct srp_function_template ib_srp_transport_functions = {
        .has_rport_state         = true,
        .reset_timer_if_blocked  = true,
+       .reconnect_delay         = &srp_reconnect_delay,
        .fast_io_fail_tmo        = &srp_fast_io_fail_tmo,
        .dev_loss_tmo            = &srp_dev_loss_tmo,
        .reconnect               = srp_rport_reconnect,
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to