Hey Hannes,

This will not fix any hangs after the scsi eh or iscsi eh has fired, but 
I think this patch will help prevent the scsi eh from firing when we do 
not need it to like you have seen in some bugzillas. The patch was made 
over the my iscsi tree. It should also apply to scsi-rc-fixes with the 
patches I sent the other day.

I modified our command timedout handler so if a command has made some 
progress since the last timeout or if it is just getting started (it has 
been put on the wire but we have not yet got anything for it), then we 
will ask for some more time to run it.

This is helping here for these problems:
1. sending more IO than the disk/target can handle
2. using a shorter scsi cmd timeout with a slower link

I am going to combine this with those change_queue_depth patches if they 
are ok upstream, and in the end also add lpfc/qla2xxx's rampup/rampdown 
code to scsi-ml. So basically if we determine if we are sending too many 
IOs, then we can call some helper rampdown code to drop the queue depth 
for the user. If however it was a transient problem, the common ramp up 
code will detect it and increase it again.

I think with the combo rampup/rampdown and the modified 
iscsi_eh_cmd_timed_out in this patch, it should fix a lot of problems we 
see where the scsi-eh runs when it should not.

--~--~---------~--~----~------------~-------~--~----~
You received this message because you are subscribed to the Google Groups 
"open-iscsi" group.
To post to this group, send email to open-iscsi@googlegroups.com
To unsubscribe from this group, send email to 
open-iscsi+unsubscr...@googlegroups.com
For more options, visit this group at http://groups.google.com/group/open-iscsi
-~----------~----~----~----~------~----~------~--~---

>From 23acb13e1290eb0e0c08287fea1e396a17cd2167 Mon Sep 17 00:00:00 2001
From: Mike Christie <micha...@cs.wisc.edu>
Date: Mon, 18 May 2009 19:59:17 -0500
Subject: [PATCH 4/5] libiscsi: reset command timer if iscsi task is making progress

This patch has the iscsi eh cmd time out handler ask for more
time if the command has completed a pdu within the command timer.
It also makes sure that we check the transport and that the
transport checks do not accidentally reset the command timer
if the command really does need to be unjammed via the scsi eh.

Signed-off-by: Mike Christie <micha...@cs.wisc.edu>
---
 drivers/scsi/libiscsi.c     |   52 +++++++++++++++++++++++++++++++++---------
 drivers/scsi/libiscsi_tcp.c |    6 +++-
 include/scsi/libiscsi.h     |    4 +++
 3 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 59908ae..4cc3184 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -1361,6 +1361,9 @@ static inline struct iscsi_task *iscsi_alloc_task(struct iscsi_conn *conn,
 	task->state = ISCSI_TASK_PENDING;
 	task->conn = conn;
 	task->sc = sc;
+	task->have_checked_conn = 0;
+	task->last_timeout = jiffies;
+	task->last_recv = jiffies;
 	INIT_LIST_HEAD(&task->running);
 	return task;
 }
@@ -1716,17 +1719,18 @@ static int iscsi_has_ping_timed_out(struct iscsi_conn *conn)
 		return 0;
 }
 
-static enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *scmd)
+static enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
 {
+	enum blk_eh_timer_return rc = BLK_EH_NOT_HANDLED;
+	struct iscsi_task *task = NULL;
 	struct iscsi_cls_session *cls_session;
 	struct iscsi_session *session;
 	struct iscsi_conn *conn;
-	enum blk_eh_timer_return rc = BLK_EH_NOT_HANDLED;
 
-	cls_session = starget_to_session(scsi_target(scmd->device));
+	cls_session = starget_to_session(scsi_target(sc->device));
 	session = cls_session->dd_data;
 
-	ISCSI_DBG_SESSION(session, "scsi cmd %p timedout\n", scmd);
+	ISCSI_DBG_SESSION(session, "scsi cmd %p timedout\n", sc);
 
 	spin_lock(&session->lock);
 	if (session->state != ISCSI_STATE_LOGGED_IN) {
@@ -1745,6 +1749,23 @@ static enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *scmd)
 		goto done;
 	}
 
+	task = (struct iscsi_task *)sc->SCp.ptr;
+	if (!task)
+		goto done;
+	/*
+	 * If we have processed a PDU for the command since the last
+	 * timeout then ask for more time.
+	 */
+	if (time_after_eq(task->last_recv, task->last_timeout)) {
+		ISCSI_DBG_CONN(conn, "Command making progress. Asking "
+			       "scsi-ml for more time to complete. "
+			       "Last data recv at %lu. Last timeout was at "
+			       "%lu\n.", task->last_recv, task->last_timeout);
+		task->have_checked_conn = 0;
+		rc = BLK_EH_RESET_TIMER;
+		goto done;
+	}
+
 	if (!conn->recv_timeout && !conn->ping_timeout)
 		goto done;
 	/*
@@ -1755,20 +1776,29 @@ static enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *scmd)
 		rc = BLK_EH_RESET_TIMER;
 		goto done;
 	}
+
+	/* Assumes nop timeout is shorter than scsi cmd timeout */
+	if (task->have_checked_conn)
+		goto done;
+
 	/*
-	 * if we are about to check the transport then give the command
-	 * more time
+	 * Checking the transport already or nop from a cmd timeout still
+	 * running
 	 */
-	if (time_before_eq(conn->last_recv + (conn->recv_timeout * HZ),
-			   jiffies)) {
+	if (conn->ping_task) {
+		task->have_checked_conn = 1;
 		rc = BLK_EH_RESET_TIMER;
 		goto done;
 	}
 
-	/* if in the middle of checking the transport then give us more time */
-	if (conn->ping_task)
-		rc = BLK_EH_RESET_TIMER;
+	/* Make sure there is a transport check done */
+	iscsi_send_nopout(conn, NULL);
+	task->have_checked_conn = 1;
+	rc = BLK_EH_RESET_TIMER;
+
 done:
+	if (task)
+		task->last_timeout = jiffies;
 	spin_unlock(&session->lock);
 	ISCSI_DBG_SESSION(session, "return %s\n", rc == BLK_EH_RESET_TIMER ?
 			  "timer reset" : "nh");
diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c
index 2bc0709..0c3bd5c 100644
--- a/drivers/scsi/libiscsi_tcp.c
+++ b/drivers/scsi/libiscsi_tcp.c
@@ -686,6 +686,7 @@ iscsi_tcp_hdr_dissect(struct iscsi_conn *conn, struct iscsi_hdr *hdr)
 				     "offset=%d, datalen=%d)\n",
 				      tcp_task->data_offset,
 				      tcp_conn->in.datalen);
+			task->last_recv = jiffies;
 			rc = iscsi_segment_seek_sg(&tcp_conn->in.segment,
 						   sdb->table.sgl,
 						   sdb->table.nents,
@@ -713,9 +714,10 @@ iscsi_tcp_hdr_dissect(struct iscsi_conn *conn, struct iscsi_hdr *hdr)
 			rc = ISCSI_ERR_BAD_ITT;
 		else if (ahslen)
 			rc = ISCSI_ERR_AHSLEN;
-		else if (task->sc->sc_data_direction == DMA_TO_DEVICE)
+		else if (task->sc->sc_data_direction == DMA_TO_DEVICE) {
+			task->last_recv = jiffies;
 			rc = iscsi_tcp_r2t_rsp(conn, task);
-		else
+		} else
 			rc = ISCSI_ERR_PROTO;
 		spin_unlock(&conn->session->lock);
 		break;
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index 196525c..714c34a 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -125,6 +125,10 @@ struct iscsi_task {
 	struct scsi_cmnd	*sc;		/* associated SCSI cmd*/
 	struct iscsi_conn	*conn;		/* used connection    */
 
+	/* data processing tracking */
+	unsigned long		last_recv;
+	unsigned long		last_timeout;
+	int			have_checked_conn;
 	/* state set/tested under session->lock */
 	int			state;
 	atomic_t		refcount;
-- 
1.6.0.6

Reply via email to