Mike Christie wrote:
Dev, Vasu wrote:
-----Original Message-----
From: Mike Christie [mailto:[EMAIL PROTECTED]]
So any idea why system locks up with your 14 patches?
Weird: with the 14 patches it works almost perfectly for me, but now it
does not work for you guys :)
It was not a system lock-up; instead, my ssh session hung after these 14
patches of yours during the error test. I mostly work in an ssh session, and
I thought my system had hung without any additional log prints in my
netconsole window. So I went ahead and power-cycled the system, assuming it
had locked up.
This time I observed that the console was still alive for some time after the
error test, and I could run some commands. However, new ssh requests failed
persistently with these errors.
I found a lockup with the handle-vasu-comments.patch patch. Without the
patch we could call the resp function twice and bad things could happen.
Without the handle-vasu-comments.patch patch, if fc_fcp.c sends an
abort, fc_seq_exch_abort sets the timer on the ep. If we get an ABTS
response at the same time fc_exch_timeout is starting to run,
fc_exch_abts_resp could get the lock first, do its work, then drop the
ex_lock and call resp. Then fc_exch_timeout could grab the lock and call
the resp function (I guess fc_fcp_error could get FC_EX_TIMEOUT because
the abort code had set the timer on the ep and fc_exch_timeout was
returning FC_EX_TIMEOUT on the abort). If the first call freed the fsp
and ep, then when the second call to resp grabbed the scsi pkt lock, who
knows what could happen.
With the handle-vasu-comments.patch I forgot to drop the ex_lock when I
exited fc_exch_timeout, so it will lock up the box if you are unlucky.
If you are lucky, I think we can trudge on for a while, depending on what
else times out, the processor setup, and what else happens.
I rolled up the changes in the last two patches
handle-vasu-comments.patch
clear-complete-during-reset.patch
and fixed the issue above in handle-vasu-comments.patch in the attached
patch.
This patch should apply over the original 14 patches with some offsets.
I made the patch over some other patches, but the offsets should be ok.
diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
index 207e849..f3afef3 100644
--- a/drivers/scsi/libfc/fc_exch.c
+++ b/drivers/scsi/libfc/fc_exch.c
@@ -394,7 +394,8 @@ static void fc_exch_timeout(unsigned long ep_arg)
spin_unlock_bh(&ep->ex_lock);
if (e_stat & ESB_ST_REC_QUAL)
fc_exch_rrq(ep);
- } else {
+ goto done;
+ } else if (!(e_stat & ESB_ST_ABNORMAL)) {
resp = ep->resp;
arg = ep->resp_arg;
/*
@@ -407,8 +408,10 @@ static void fc_exch_timeout(unsigned long ep_arg)
if (resp)
resp(sp, ERR_PTR(-FC_EX_TIMEOUT), arg);
fc_seq_exch_abort(sp);
+ goto done;
}
-
+ spin_unlock_bh(&ep->ex_lock);
+done:
/*
* This release matches the hold taken when the timer was set.
*/
@@ -1272,12 +1275,16 @@ static void fc_exch_abts_resp(struct fc_exch *ep,
struct fc_frame *fp)
struct fc_seq *sp;
u16 low;
u16 high;
- int rc = 1;
+ int rc = 1, has_rec = 0;
fh = fc_frame_header_get(fp);
if (fc_exch_debug)
FC_DBG("exch: BLS rctl %x - %s\n",
fh->fh_r_ctl, fc_exch_rctl_name(fh->fh_r_ctl));
+
+ if (del_timer_sync(&ep->ex_timer))
+ fc_exch_release(ep); /* release from pending timer hold */
+
spin_lock_bh(&ep->ex_lock);
switch (fh->fh_r_ctl) {
case FC_RCTL_BA_ACC:
@@ -1297,7 +1304,7 @@ static void fc_exch_abts_resp(struct fc_exch *ep, struct
fc_frame *fp)
ap->ba_seq_id == ep->seq_id) && low != high) {
ep->esb_stat |= ESB_ST_REC_QUAL;
fc_exch_hold(ep); /* hold for recovery qualifier */
- fc_exch_timer_set_locked(ep, ep->r_a_tov);
+ has_rec = 1;
}
break;
case FC_RCTL_BA_RJT:
@@ -1325,6 +1332,8 @@ static void fc_exch_abts_resp(struct fc_exch *ep, struct
fc_frame *fp)
if (resp)
resp(sp, fp, ex_resp_arg);
+ if (has_rec)
+ fc_exch_timer_set(ep, ep->r_a_tov);
fc_frame_free(fp);
}
@@ -1453,7 +1462,7 @@ static void fc_exch_reset(struct fc_exch *ep)
ep->resp = NULL;
if (ep->esb_stat & ESB_ST_REC_QUAL)
atomic_dec(&ep->ex_refcnt); /* drop hold for rec_qual */
- ep->esb_stat &= ~ESB_ST_REC_QUAL;
+ ep->esb_stat &= ~(ESB_ST_REC_QUAL | ESB_ST_COMPLETE);
arg = ep->resp_arg;
if (del_timer(&ep->ex_timer))
atomic_dec(&ep->ex_refcnt); /* drop hold for timer */
@@ -1625,7 +1634,7 @@ static void fc_exch_rrq_resp(struct fc_seq *sp, struct
fc_frame *fp, void *arg)
switch (op) {
case ELS_LS_RJT:
FC_DBG("LS_RJT for RRQ");
- break;
+ /* fall through */
case ELS_LS_ACC:
fc_exch_done(&aborted_ep->seq);
fc_exch_release(aborted_ep); /* drop hold for rec qual */
diff --git a/drivers/scsi/libfc/fc_fcp.c b/drivers/scsi/libfc/fc_fcp.c
index a050dd4..f2915ed 100644
--- a/drivers/scsi/libfc/fc_fcp.c
+++ b/drivers/scsi/libfc/fc_fcp.c
@@ -1058,16 +1058,9 @@ static void fc_fcp_error(struct fc_fcp_pkt *fsp, struct
fc_frame *fp)
case -FC_EX_CLOSED:
fc_fcp_retry_cmd(fsp);
goto unlock;
- case -FC_EX_TIMEOUT:
- /*
- * exch layer decided to abort exchange -
- * will wait for response
- */
- fsp->state |= FC_SRB_ABORT_PENDING;
- goto unlock;
+ default:
+ FC_DBG("unknown error %ld\n", PTR_ERR(fp));
}
-
- FC_DBG("unknown error %ld\n", PTR_ERR(fp));
/*
* clear abort pending, because the lower layer
* decided to force completion.
_______________________________________________
devel mailing list
[email protected]
http://www.open-fcoe.org/mailman/listinfo/devel