Roland Dreier wrote:
I'm testing the patch below, which I think does the right thing for
aborts and device resets.  What do you think?


It still does not address the issue I pointed out in my previous email: after the first eh_host_reset_handler() succeeds, scsi_eh_host_reset() immediately sends a start-stop-unit (STU) or test-unit-ready (TUR) command using the same scsi command. This STU or TUR command gets stuck in our queue, times out, and gets aborted. The abort of the STU or TUR command times out once again. The original scsi command gets freed. Because we delay the clean-up of the associated request to eh_device_reset_handler() instead of doing it in eh_abort_handler(), the request is still in our queue. The LUN is marked offline, so the next eh_device_reset_handler() for the same LUN won't be called. The next eh_host_reset_handler() will then hit a use-after-free bug.
You can see this in the log below:


May  5 16:36:22 lab105 kernel: ib_srp: failed receive status 5
May  5 16:36:24 lab105 kernel: ib_srp: connection closed
May 5 16:36:24 lab105 kernel: ib_mthca 0000:05:00.0: CQ overrun on CQN 040082
May  5 16:36:24 lab105 kernel: ib_srp: QP event 1
May  5 16:36:24 lab105 last message repeated 2 times
May  5 16:36:54 lab105 kernel: SRP abort called
May  5 16:36:59 lab105 kernel: SRP reset_device called
May  5 16:37:04 lab105 kernel: ib_srp: SRP reset_host called
May  5 16:37:06 lab105 kernel: ib_srp: connection closed
May  5 16:37:16 lab105 kernel: ib_srp: QP event 1
May  5 16:37:16 lab105 last message repeated 3 times
May  5 16:37:26 lab105 kernel: SRP abort called
May  5 16:37:26 lab105 kernel: ib_srp: QP event 1
May 5 16:37:31 lab105 kernel: sd 6:0:0:1: scsi: Device offlined - not ready after error recovery
May  5 16:37:31 lab105 kernel: sd 6:0:0:1: rejecting I/O to offline device
May 5 16:37:31 lab105 kernel: Buffer I/O error on device sdd, logical block 0
May 5 16:37:31 lab105 kernel: Buffer I/O error on device sdd, logical block 1
May  5 16:37:31 lab105 kernel: sd 6:0:0:1: rejecting I/O to offline device
May 5 16:37:31 lab105 kernel: Buffer I/O error on device sdd, logical block 0
May  5 16:37:31 lab105 kernel: ib_srp: QP event 1
May  5 16:37:31 lab105 kernel: ib_srp: QP event 1
May  5 16:38:01 lab105 kernel: SRP abort called
May  5 16:38:06 lab105 kernel: SRP reset_device called
May  5 16:38:11 lab105 kernel: ib_srp: SRP reset_host called
May  5 16:38:13 scsi_eh_6[27704]: Oops 11012296146944 [1]
Modules linked in: ib_srp ib_cm ib_sa ib_mthca ib_mad ib_core nls_utf8 evdev joydev sg st sr_mod ide_cd cdrom usbserial parport_pc lp parport thermal processor ipv6 fan button d

Pid: 27704, CPU 1, comm:            scsi_eh_6
psr : 00001210081a6018 ifs : 800000000000058e ip : [<a000000202275671>] Not tainted
lab105 kernel: iip is at srp_reconnect_target+0x2b1/0x5e0 [ib_srp]
unat: 0000000000000000 pfs : 000000000000058e rsc : 0000000000000003
rnat: 0000000000000000 bsps: 0000000000000000 pr  : 0000000000009141
ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c8a70433f
csd : 0000000000000000 ssd : 0000000000000000
b0  : a0000002022755e0 b6  : a00000010000faa0 b7  : a000000202215ac0
f6  : 1003e6b6b6b6b6b6b6b6b f7  : 0ffdd8000000000000000
f8  : 1003e00000000000014c8 f9  : 1003e0000000000000013
f10 : 1003e0000000000000000 f11 : 1003e0000000000000000
r1  : a0000002022782f8 r2  : e0000001f0d6bb48 r3  : e0000001f0d6b9e8
r8  : 0000000000000000 r9  : a00000010090d8f0 r10 : a00000010090d8f8
r11 : 0000000000000001 r12 : e0000001b03d7d00 r13 : e0000001b03d0000
r14 : a00000010090d900 r15 : e0000001b03d0000 r16 : 0000000000000001
r17 : 0000000000000001 r18 : e0000001b03d0fa4 r19 : a00000010090d908
r20 : ffffffffffffffff r21 : 0000000000000008 r22 : e0000000752c4400
r23 : e000000060ee5688 r24 : 0000000000000080 r25 : e0000000752c441f
r26 : a000000202215ac0 r27 : e00000018a40e1e0 r28 : e00000018a40e000
r29 : e000000060ee55e8 r30 : e0000001b1fd8ac0 r31 : e0000001b1fd8a28

Call Trace:
b_srp: connectio [<a000000100013720>] show_stack+0x80/0xa0
                                sp=e0000001b03d7880 bsp=e0000001b03d1330
n  closed
[<a000000100013f80>] show_regs+0x840/0x880
                                sp=e0000001b03d7a50 bsp=e0000001b03d12d0
 [<a000000100037c30>] die+0x1b0/0x2e0
                                sp=e0000001b03d7a60 bsp=e0000001b03d1288
 [<a00000010005b150>] ia64_do_page_fault+0x970/0xae0
                                sp=e0000001b03d7a80 bsp=e0000001b03d1220
 [<a00000010000be80>] ia64_leave_kernel+0x0/0x280
                                sp=e0000001b03d7b30 bsp=e0000001b03d1220
 [<a000000202275670>] srp_reconnect_target+0x2b0/0x5e0 [ib_srp]
                                sp=e0000001b03d7d00 bsp=e0000001b03d11a8
 [<a000000202275a00>] srp_reset_host+0x60/0xa0 [ib_srp]
                                sp=e0000001b03d7dc0 bsp=e0000001b03d1180
 [<a000000201b27370>] scsi_try_host_reset+0xd0/0x240 [scsi_mod]
                                sp=e0000001b03d7dc0 bsp=e0000001b03d1150
 [<a000000201b29ab0>] scsi_error_handler+0x17f0/0x2220 [scsi_mod]
                                sp=e0000001b03d7dc0 bsp=e0000001b03d1068
 [<a0000001000bdb00>] kthread+0x220/0x280
                                sp=e0000001b03d7e10 bsp=e0000001b03d1028
 [<a000000100011ae0>] kernel_thread_helper+0xe0/0x100
                                sp=e0000001b03d7e30 bsp=e0000001b03d1000
 [<a000000100009140>] start_kernel_thread+0x20/0x40
                                sp=e0000001b03d7e30 bsp=e0000001b03d1000
May 5 16:38:14 lab105 kernel: Unable to handle kernel paging request at virtual address 6b6b6b6b6b6b6b6b
May  5 16:38:14 lab105 kernel: scsi_eh_6[27704]: Oops 11012296146944 [1]
May 5 16:38:14 lab105 kernel: Modules linked in: ib_srp ib_cm ib_sa ib_mthca ib_mad ib_core nls_utf8 evdev joydev sg st sr_mod ide_cd cdrom usbserial parport_pc lp parport thed
May  5 16:38:14 lab105 kernel:
May  5 16:38:14 lab105 kernel: Pid: 27704, CPU 1, comm:            scsi_eh_6
May 5 16:38:14 lab105 kernel: psr : 00001210081a6018 ifs : 800000000000058e ip : [<a000000202275671>] Not tainted May 5 16:38:14 lab105 kernel: ip is at srp_reconnect_target+0x2b1/0x5e0 [ib_srp] May 5 16:38:14 lab105 kernel: unat: 0000000000000000 pfs : 000000000000058e rsc : 0000000000000003 May 5 16:38:14 lab105 kernel: rnat: 0000000000000000 bsps: 0000000000000000 pr : 0000000000009141 May 5 16:38:14 lab105 kernel: ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c8a70433f
May  5 16:38:14 lab105 kernel: csd : 0000000000000000 ssd : 0000000000000000
May 5 16:38:14 lab105 kernel: b0 : a0000002022755e0 b6 : a00000010000faa0 b7 : a000000202215ac0 May 5 16:38:14 lab105 kernel: f6 : 1003e6b6b6b6b6b6b6b6b f7 : 0ffdd8000000000000000 May 5 16:38:14 lab105 kernel: f8 : 1003e00000000000014c8 f9 : 1003e0000000000000013 May 5 16:38:14 lab105 kernel: f10 : 1003e0000000000000000 f11 : 1003e0000000000000000 May 5 16:38:14 lab105 kernel: r1 : a0000002022782f8 r2 : e0000001f0d6bb48 r3 : e0000001f0d6b9e8 May 5 16:38:14 lab105 kernel: r8 : 0000000000000000 r9 : a00000010090d8f0 r10 : a00000010090d8f8 May 5 16:38:14 lab105 kernel: r11 : 0000000000000001 r12 : e0000001b03d7d00 r13 : e0000001b03d0000 May 5 16:38:14 lab105 kernel: r14 : a00000010090d900 r15 : e0000001b03d0000 r16 : 0000000000000001 May 5 16:38:14 lab105 kernel: r17 : 0000000000000001 r18 : e0000001b03d0fa4 r19 : a00000010090d908 May 5 16:38:14 lab105 kernel: r20 : ffffffffffffffff r21 : 0000000000000008 r22 : e0000000752c4400 May 5 16:38:14 lab105 kernel: r23 : e000000060ee5688 r24 : 0000000000000080 r25 : e0000000752c441f May 5 16:38:14 lab105 kernel: r26 : a000000202215ac0 r27 : e00000018a40e1e0 r28 : e00000018a40e000 May 5 16:38:14 lab105 kernel: r29 : e000000060ee55e8 r30 : e0000001b1fd8ac0 r31 : e0000001b1fd8a28
May  5 16:38:14 lab105 kernel:
May  5 16:38:14 lab105 kernel: Call Trace:
May  5 16:38:14 lab105 kernel:  [<a000000100013720>] show_stack+0x80/0xa0
May 5 16:38:14 lab105 kernel: sp=e0000001b03d7880 bsp=e0000001b03d1330
May  5 16:38:14 lab105 kernel:  [<a000000100013f80>] show_regs+0x840/0x880
May 5 16:38:14 lab105 kernel: sp=e0000001b03d7a50 bsp=e0000001b03d12d0
May  5 16:38:14 lab105 kernel:  [<a000000100037c30>] die+0x1b0/0x2e0
May 5 16:38:14 lab105 kernel: sp=e0000001b03d7a60 bsp=e0000001b03d1288 May 5 16:38:14 lab105 kernel: [<a00000010005b150>] ia64_do_page_fault+0x970/0xae0 May 5 16:38:14 lab105 kernel: sp=e0000001b03d7a80 bsp=e0000001b03d1220 May 5 16:38:14 lab105 kernel: [<a00000010000be80>] ia64_leave_kernel+0x0/0x280 May 5 16:38:14 lab105 kernel: sp=e0000001b03d7b30 bsp=e0000001b03d1220 May 5 16:38:14 lab105 kernel: [<a000000202275670>] srp_reconnect_target+0x2b0/0x5e0 [ib_srp] May 5 16:38:14 lab105 kernel: sp=e0000001b03d7d00 bsp=e0000001b03d11a8 May 5 16:38:14 lab105 kernel: [<a000000202275a00>] srp_reset_host+0x60/0xa0 [ib_srp] May 5 16:38:14 lab105 kernel: sp=e0000001b03d7dc0 bsp=e0000001b03d1180 May 5 16:38:14 lab105 kernel: [<a000000201b27370>] scsi_try_host_reset+0xd0/0x240 [scsi_mod] May 5 16:38:14 lab105 kernel: sp=e0000001b03d7dc0 bsp=e0000001b03d1150 May 5 16:38:14 lab105 kernel: [<a000000201b29ab0>] scsi_error_handler+0x17f0/0x2220 [scsi_mod] May 5 16:38:14 lab105 kernel: sp=e0000001b03d7dc0 bsp=e0000001b03d1068
May  5 16:38:14 lab105 kernel:  [<a0000001000bdb00>] kthread+0x220/0x280
May 5 16:38:14 lab105 kernel: sp=e0000001b03d7e10 bsp=e0000001b03d1028 May 5 16:38:14 lab105 kernel: [<a000000100011ae0>] kernel_thread_helper+0xe0/0x100 May 5 16:38:14 lab105 kernel: sp=e0000001b03d7e30 bsp=e0000001b03d1000 May 5 16:38:14 lab105 kernel: [<a000000100009140>] start_kernel_thread+0x20/0x40 May 5 16:38:14 lab105 kernel: sp=e0000001b03d7e30 bsp=e0000001b03d1000


This patch, applied on top of your patch, will fix the problem.

    IB/srp: Fix tracking of pending requests during error handling

    Signed-off-by: Vu Pham <[EMAIL PROTECTED]>


diff -Naur infiniband/ulp/srp.roland-eh/ib_srp.c infiniband/ulp/srp/ib_srp.c
--- infiniband/ulp/srp.roland-eh/ib_srp.c	2006-05-05 16:59:35.000000000 -0700
+++ infiniband/ulp/srp/ib_srp.c	2006-05-05 17:21:31.000000000 -0700
@@ -1204,18 +1204,18 @@
 	if (srp_find_req(target, scmnd, &req))
 		return FAILED;
 	if (srp_send_tsk_mgmt(target, req, SRP_TSK_ABORT_TASK))
-		return FAILED;
+		ret = FAILED;
 
 	spin_lock_irq(target->scsi_host->host_lock);
 
-	if (req->cmd_done) {
-		srp_remove_req(target, req);
+	srp_remove_req(target, req);
+
+	if (req->cmd_done)
 		scmnd->scsi_done(scmnd);
-	} else if (!req->tsk_status) {
-		srp_remove_req(target, req);
+	else if (!req->tsk_status)
 		scmnd->result = DID_ABORT << 16;
-	} else
-		ret = FAILED;
+	else
+		scmnd->result = DID_RESET << 16;
 
 	spin_unlock_irq(target->scsi_host->host_lock);
 
@@ -1225,29 +1225,17 @@
 static int srp_reset_device(struct scsi_cmnd *scmnd)
 {
 	struct srp_target_port *target = host_to_target(scmnd->device->host);
-	struct srp_request *req, *tmp;
+	struct srp_request *req;
+	int ret = SUCCESS;
 
 	printk(KERN_ERR "SRP reset_device called\n");
 
-	if (srp_find_req(target, scmnd, &req))
-		return FAILED;
-	if (srp_send_tsk_mgmt(target, req, SRP_TSK_LUN_RESET))
-		return FAILED;
-	if (req->tsk_status)
-		return FAILED;
-
-	spin_lock_irq(target->scsi_host->host_lock);
-
-	list_for_each_entry_safe(req, tmp, &target->req_queue, list)
-		if (req->scmnd->device == scmnd->device) {
-			req->scmnd->result = DID_RESET << 16;
-			scmnd->scsi_done(scmnd);
-			srp_remove_req(target, req);
-		}
-
-	spin_unlock_irq(target->scsi_host->host_lock);
+	if ((srp_find_req(target, scmnd, &req)) ||
+	    (srp_send_tsk_mgmt(target, req, SRP_TSK_LUN_RESET)) ||
+	    (req->tsk_status))
+		ret = FAILED;
 
-	return SUCCESS;
+	return ret;
 }
 
 static int srp_reset_host(struct scsi_cmnd *scmnd)
_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to