On Tue, Oct 13, 2009 at 12:57 AM, Vu Pham <[email protected]> wrote:
>
>
> Introduce the srp_dev_loss_tmo module parameter and create a timer to clean
> up the connection after srp_dev_loss_tmo expires. While srp_dev_loss_tmo is
> running, the QP is in the error state; srp returns DID_RESET for outstanding
> I/O, FAILED for abort_cmd and reset_lun, and SUCCESS (without trying to
> reconnect) for reset_host.
>
> Signed-off-by: Vu Pham <[email protected]>
>
>
>
> Index: ofed_kernel/drivers/infiniband/ulp/srp/ib_srp.c
> ===================================================================
> --- ofed_kernel.orig/drivers/infiniband/ulp/srp/ib_srp.c
> +++ ofed_kernel/drivers/infiniband/ulp/srp/ib_srp.c
> @@ -78,6 +77,13 @@
> MODULE_PARM_DESC(mellanox_workarounds,
> "Enable workarounds for Mellanox SRP target bugs if != 0");
>
> +static int srp_dev_loss_tmo = 60;
> +
> +module_param(srp_dev_loss_tmo, int, 0444);
> +MODULE_PARM_DESC(srp_dev_loss_tmo,
> + "Default number of seconds that srp transport should \
> + insulate the lost of a remote port (default is 60 secs");
> +
> static void srp_add_one(struct ib_device *device);
> static void srp_remove_one(struct ib_device *device);
> static void srp_completion(struct ib_cq *cq, void *target_ptr);
> @@ -898,6 +926,48 @@
> DMA_FROM_DEVICE);
> }
>
> +static void srp_reconnect_work(struct work_struct *work)
> +{
> + struct srp_target_port *target =
> + container_of(work, struct srp_target_port, work);
> +
> + srp_reconnect_target(target);
> + target->work_in_progress = 0;
> +}
> +
> +static void srp_qp_in_err_timer(unsigned long data)
> +{
> + struct srp_target_port *target = (struct srp_target_port *)data;
> + struct srp_request *req, *tmp;
> +
> + if (target->state != SRP_TARGET_LIVE)
> + return;
> +
> + spin_lock_irq(target->scsi_host->host_lock);
> + list_for_each_entry_safe(req, tmp, &target->req_queue, list)
> + srp_reset_req(target, req);
> + spin_unlock_irq(target->scsi_host->host_lock);
> +
> + spin_lock_irq(target->scsi_host->host_lock);
> + if (!target->work_in_progress) {
> + target->work_in_progress = 1;
> + INIT_WORK(&target->work, srp_reconnect_work);
> + schedule_work(&target->work);
> + }
> + spin_unlock_irq(target->scsi_host->host_lock);
> +}
> +
> +static void srp_qp_err_add_timer(struct srp_target_port *target, int time)
> +{
> + if (!timer_pending(&target->qp_err_timer)) {
> + setup_timer(&target->qp_err_timer,
> + srp_qp_in_err_timer,
> + (unsigned long)target);
> + target->qp_err_timer.expires = time * HZ + jiffies;
> + add_timer(&target->qp_err_timer);
> + }
> +}
What will happen when the ib_srp kernel module is removed after the
timer has been set up but before the timer has fired? Isn't a call to
del_timer_sync() missing in srp_remove_work()?
> +
> static void srp_completion(struct ib_cq *cq, void *target_ptr)
> {
> struct srp_target_port *target = target_ptr;
> @@ -960,11 +980,20 @@
> ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
> while (ib_poll_cq(cq, 1, &wc) > 0) {
> if (wc.status) {
> + unsigned long flags;
> +
> shost_printk(KERN_ERR, target->scsi_host,
> PFX "failed %s status %d\n",
> wc.wr_id & SRP_OP_RECV ? "receive" :
> "send",
> wc.status);
> - target->qp_in_error = 1;
> + spin_lock_irqsave(target->scsi_host->host_lock, flags);
> + if (!target->qp_in_error &&
> + target->state == SRP_TARGET_LIVE) {
> + target->qp_in_error = 1;
> + srp_qp_err_add_timer(target,
> + srp_dev_loss_tmo - 55);
> + }
> + spin_unlock_irqrestore(target->scsi_host->host_lock, flags);
> break;
> }
>
> @@ -1274,5 +1299,6 @@
> int attr_mask = 0;
> int comp = 0;
> int opcode = 0;
> + unsigned long flags;
>
> switch (event->event) {
> @@ -1301,6 +1381,14 @@
> shost_printk(KERN_ERR, target->scsi_host,
> PFX "connection closed\n");
>
> + spin_lock_irqsave(target->scsi_host->host_lock, flags);
> + if (!target->qp_in_error &&
> + target->state == SRP_TARGET_LIVE) {
> + target->qp_in_error = 1;
> + srp_qp_err_add_timer(target,
> + srp_dev_loss_tmo - 55);
> + }
> + spin_unlock_irqrestore(target->scsi_host->host_lock, flags);
> target->status = 0;
> break;
>
> @@ -1443,9 +1529,22 @@
> static int srp_reset_host(struct scsi_cmnd *scmnd)
> {
> struct srp_target_port *target = host_to_target(scmnd->device->host);
> + struct srp_request *req, *tmp;
> int ret = FAILED;
>
> - shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n");
> + shost_printk(KERN_ERR, target->scsi_host,
> + PFX "SRP reset_host called state %d qp_err %d\n",
> + target->state, target->qp_in_error);
> +
> + spin_lock_irq(target->scsi_host->host_lock);
> + if (timer_pending(&target->qp_err_timer) || target->qp_in_error ||
> + target->state != SRP_TARGET_LIVE) {
> + list_for_each_entry_safe(req, tmp, &target->req_queue, list)
> + srp_reset_req(target, req);
> + spin_unlock_irq(target->scsi_host->host_lock);
> + return SUCCESS;
> + }
> + spin_unlock_irq(target->scsi_host->host_lock);
>
> if (!srp_reconnect_target(target))
> ret = SUCCESS;
> @@ -2150,6 +2342,9 @@
> sizeof (struct srp_indirect_buf) +
> srp_sg_tablesize * 16);
>
> + if (srp_dev_loss_tmo < 60)
> + srp_dev_loss_tmo = 60;
> +
> ret = class_register(&srp_class);
> if (ret) {
> printk(KERN_ERR PFX "couldn't register class infiniband_srp\n");
> Index: ofed_kernel/drivers/infiniband/ulp/srp/ib_srp.h
> ===================================================================
> --- ofed_kernel.orig/drivers/infiniband/ulp/srp/ib_srp.h
> +++ ofed_kernel/drivers/infiniband/ulp/srp/ib_srp.h
> @@ -153,12 +159,14 @@
> struct srp_request req_ring[SRP_SQ_SIZE];
>
> struct work_struct work;
> + int work_in_progress;
>
> struct list_head list;
> struct completion done;
> int status;
> enum srp_target_state state;
> int qp_in_error;
> + struct timer_list qp_err_timer;
> };
>
> struct srp_iu {
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html