Module Name:    src
Committed By:   buhrow
Date:           Tue Apr  1 23:57:54 UTC 2014

Modified Files:
        src/sys/dev/ic: mpt_netbsd.c mpt_netbsd.h

Log Message:
Checking in changes to improve error handling.  Specifically:

- If commands time out, clear the queues to the card and perform a soft
reset on the LSI hardware, since when these timeouts occur the LSI firmware
does not recover gracefully at all.

- Recover gracefully from more kinds of errors using the same recovery
mechanism described above.

Also, implement mpt_ioctl() to handle bus reset requests from scsictl(8).


To generate a diff of this commit:
cvs rdiff -u -r1.19 -r1.20 src/sys/dev/ic/mpt_netbsd.c
cvs rdiff -u -r1.10 -r1.11 src/sys/dev/ic/mpt_netbsd.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/dev/ic/mpt_netbsd.c
diff -u src/sys/dev/ic/mpt_netbsd.c:1.19 src/sys/dev/ic/mpt_netbsd.c:1.20
--- src/sys/dev/ic/mpt_netbsd.c:1.19	Sun Sep 23 01:13:21 2012
+++ src/sys/dev/ic/mpt_netbsd.c	Tue Apr  1 23:57:54 2014
@@ -1,4 +1,4 @@
-/*	$NetBSD: mpt_netbsd.c,v 1.19 2012/09/23 01:13:21 chs Exp $	*/
+/*	$NetBSD: mpt_netbsd.c,v 1.20 2014/04/01 23:57:54 buhrow Exp $	*/
 
 /*
  * Copyright (c) 2003 Wasabi Systems, Inc.
@@ -77,22 +77,28 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: mpt_netbsd.c,v 1.19 2012/09/23 01:13:21 chs Exp $");
+__KERNEL_RCSID(0, "$NetBSD: mpt_netbsd.c,v 1.20 2014/04/01 23:57:54 buhrow Exp $");
 
 #include <dev/ic/mpt.h>			/* pulls in all headers */
+#include <sys/scsiio.h>
 
 static int	mpt_poll(mpt_softc_t *, struct scsipi_xfer *, int);
 static void	mpt_timeout(void *);
+static void	mpt_restart(mpt_softc_t *, request_t *);
 static void	mpt_done(mpt_softc_t *, uint32_t);
+static int	mpt_drain_queue(mpt_softc_t *);
 static void	mpt_run_xfer(mpt_softc_t *, struct scsipi_xfer *);
 static void	mpt_set_xfer_mode(mpt_softc_t *, struct scsipi_xfer_mode *);
 static void	mpt_get_xfer_mode(mpt_softc_t *, struct scsipi_periph *);
 static void	mpt_ctlop(mpt_softc_t *, void *vmsg, uint32_t);
 static void	mpt_event_notify_reply(mpt_softc_t *, MSG_EVENT_NOTIFY_REPLY *);
+static void  mpt_bus_reset(mpt_softc_t *);
 
 static void	mpt_scsipi_request(struct scsipi_channel *,
 		    scsipi_adapter_req_t, void *);
 static void	mpt_minphys(struct buf *);
+static int 	mpt_ioctl(struct scsipi_channel *, u_long, void *, int,
+	struct proc *);
 
 /*
  * XXX - this assumes the device_private() of the attachement starts with
@@ -121,6 +127,7 @@ mpt_scsipi_attach(mpt_softc_t *mpt)
 	adapt->adapt_max_periph = maxq - 2;
 	adapt->adapt_request = mpt_scsipi_request;
 	adapt->adapt_minphys = mpt_minphys;
+	adapt->adapt_ioctl = mpt_ioctl;
 
 	/* Fill in the scsipi_channel. */
 	memset(chan, 0, sizeof(*chan));
@@ -138,7 +145,8 @@ mpt_scsipi_attach(mpt_softc_t *mpt)
 	chan->chan_ntargets = mpt->mpt_max_devices;
 	chan->chan_id = mpt->mpt_ini_id;
 
-	(void) config_found(mpt->sc_dev, &mpt->sc_channel, scsiprint);
+/*Save the output of the config so we can rescan the bus in case of errors*/
+	mpt->sc_scsibus_dv = config_found(mpt->sc_dev, &mpt->sc_channel, scsiprint);
 }
 
 int
@@ -303,26 +311,11 @@ mpt_intr(void *arg)
 {
 	mpt_softc_t *mpt = arg;
 	int nrepl = 0;
-	uint32_t reply;
 
 	if ((mpt_read(mpt, MPT_OFFSET_INTR_STATUS) & MPT_INTR_REPLY_READY) == 0)
 		return (0);
 
-	reply = mpt_pop_reply_queue(mpt);
-	while (reply != MPT_REPLY_EMPTY) {
-		nrepl++;
-		if (mpt->verbose > 1) {
-			if ((reply & MPT_CONTEXT_REPLY) != 0) {
-				/* Address reply; IOC has something to say */
-				mpt_print_reply(MPT_REPLY_PTOV(mpt, reply));
-			} else {
-				/* Context reply; all went well */
-				mpt_prt(mpt, "context %u reply OK", reply);
-			}
-		}
-		mpt_done(mpt, reply);
-		reply = mpt_pop_reply_queue(mpt);
-	}
+nrepl = mpt_drain_queue(mpt);
 	return (nrepl != 0);
 }
 
@@ -357,13 +350,20 @@ static void
 mpt_timeout(void *arg)
 {
 	request_t *req = arg;
-	struct scsipi_xfer *xs = req->xfer;
-	struct scsipi_periph *periph = xs->xs_periph;
-	mpt_softc_t *mpt = DEV_TO_MPT(
-	    periph->periph_channel->chan_adapter->adapt_dev);
-	uint32_t oseq;
-	int s;
-
+	struct scsipi_xfer *xs;
+	struct scsipi_periph *periph;
+	mpt_softc_t *mpt;
+ 	uint32_t oseq;
+	int s, nrepl = 0;
+ 
+if (req->xfer  == NULL) {
+		printf("mpt_timeout: NULL xfer for request index 0x%x, sequenc 0x%x\n",
+		req->index, req->sequence);
+		return;
+	}
+	xs = req->xfer;
+		periph = xs->xs_periph;
+	mpt = (void *) periph->periph_channel->chan_adapter->adapt_dev;
 	scsipi_printaddr(periph);
 	printf("command timeout\n");
 
@@ -373,11 +373,28 @@ mpt_timeout(void *arg)
 	mpt->timeouts++;
 	if (mpt_intr(mpt)) {
 		if (req->sequence != oseq) {
+			mpt->success ++;
 			mpt_prt(mpt, "recovered from command timeout");
 			splx(s);
 			return;
 		}
 	}
+
+	/*
+	 *Ensure the IOC is really done giving us data since it appears it can
+	 *sometimes fail to give us interrupts under heavy load.
+	 */
+	nrepl = mpt_drain_queue(mpt);
+	if (nrepl ) {
+		mpt_prt(mpt, "mpt_timeout: recovered %d commands",nrepl);
+	}
+
+	if (req->sequence != oseq) {
+		mpt->success ++;
+		splx(s);
+		return;
+	}
+
 	mpt_prt(mpt,
 	    "timeout on request index = 0x%x, seq = 0x%08x",
 	    req->index, req->sequence);
@@ -390,14 +407,83 @@ mpt_timeout(void *arg)
 	if (mpt->verbose > 1)
 		mpt_print_scsi_io_request((MSG_SCSI_IO_REQUEST *)req->req_vbuf);
 
-	/* XXX WHAT IF THE IOC IS STILL USING IT?? */
-	req->xfer = NULL;
-	mpt_free_request(mpt, req);
-
 	xs->error = XS_TIMEOUT;
-	scsipi_done(xs);
+	splx(s);
+	mpt_restart(mpt, req);
+}
+
+static void
+mpt_restart(mpt_softc_t *mpt, request_t *req0)
+{
+	int i, s, nreq;
+	request_t *req;
+	struct scsipi_xfer *xs;
+
+	/* first, reset the IOC, leaving stopped so all requests are idle */
+	if (mpt_soft_reset(mpt) != MPT_OK) {
+		mpt_prt(mpt, "soft reset failed");
+		/* don't try a hard reset since this mangles the PCI configuration registers */
+		return;
+	}
 
+	/* freeze the channel so scsipi doesn't queue more commands */
+	scsipi_channel_freeze(&mpt->sc_channel, 1);
+
+	/* return all pending requests to scsipi and de-allocate them */
+	s = splbio();
+	nreq = 0;
+	for (i = 0; i < MPT_MAX_REQUESTS(mpt); i++) {
+		req = &mpt->request_pool[i];
+		xs = req->xfer;
+		if (xs != NULL) {
+			if (xs->datalen != 0)
+				bus_dmamap_unload(mpt->sc_dmat, req->dmap);
+			req->xfer = NULL;
+			callout_stop(&xs->xs_callout);
+			if (req != req0) {
+				nreq++;
+				xs->error = XS_REQUEUE;
+			}
+			scsipi_done(xs);
+			/* don't really need to mpt_free_request() since mpt_init() below will free all requests anyway */
+			mpt_free_request(mpt, req);
+		}
+	}
 	splx(s);
+	if (nreq > 0)
+		mpt_prt(mpt, "re-queued %d requests", nreq);
+
+	/* re-initialize the IOC (which restarts it) */
+	if (mpt_init(mpt, MPT_DB_INIT_HOST) == 0)
+		mpt_prt(mpt, "restart succeeded");
+	/* else error message already printed */
+
+	/* thaw the channel, causing scsipi to re-queue the commands */
+	scsipi_channel_thaw(&mpt->sc_channel, 1);
+}
+
+static
+int mpt_drain_queue(mpt_softc_t *mpt)
+{
+	int nrepl = 0;
+	uint32_t reply;
+
+	reply = mpt_pop_reply_queue(mpt);
+	while (reply != MPT_REPLY_EMPTY) {
+		nrepl++;
+		if (mpt->verbose > 1) {
+			if ((reply & MPT_CONTEXT_REPLY) != 0) {
+				/* Address reply; IOC has something to say */
+				mpt_print_reply(MPT_REPLY_PTOV(mpt, reply));
+			} else {
+				/* Context reply; all went well */
+				mpt_prt(mpt, "context %u reply OK", reply);
+			}
+		}
+		mpt_done(mpt, reply);
+		reply = mpt_pop_reply_queue(mpt);
+	}
+	return (nrepl);
 }
 
 static void
@@ -409,6 +495,7 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl
 	request_t *req;
 	MSG_REQUEST_HEADER *mpt_req;
 	MSG_SCSI_IO_REPLY *mpt_reply;
+	int restart = 0; /*nonzero if we need to restart the IOC*/
 
 	if (__predict_true((reply & MPT_CONTEXT_REPLY) == 0)) {
 		/* context reply (ok) */
@@ -468,6 +555,8 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl
 	if (__predict_false(mpt_req->Function == MPI_FUNCTION_SCSI_TASK_MGMT)) {
 		if (mpt->verbose > 1)
 			mpt_prt(mpt, "mpt_done: TASK MGMT");
+			KASSERT(req == mpt->mngt_req);
+			mpt->mngt_req = NULL;
 		goto done;
 	}
 
@@ -544,9 +633,10 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl
 	}
 
 	xs->status = mpt_reply->SCSIStatus;
-	switch (le16toh(mpt_reply->IOCStatus)) {
+	switch ((le16toh(mpt_reply->IOCStatus) & MPI_IOCSTATUS_MASK)) {
 	case MPI_IOCSTATUS_SCSI_DATA_OVERRUN:
 		xs->error = XS_DRIVER_STUFFUP;
+		mpt_prt(mpt,"mpt_done: IOC overrun!");
 		break;
 
 	case MPI_IOCSTATUS_SCSI_DATA_UNDERRUN:
@@ -605,30 +695,56 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl
 
 	case MPI_IOCSTATUS_SCSI_RESIDUAL_MISMATCH:
 		xs->error = XS_DRIVER_STUFFUP;
+		mpt_prt(mpt,"mpt_done: IOC SCSI residual mismatch!");
+		restart = 1;
 		break;
 
 	case MPI_IOCSTATUS_SCSI_TASK_TERMINATED:
 		/* XXX What should we do here? */
+		mpt_prt(mpt,"mpt_done: IOC SCSI task terminated!");
+		restart = 1;
 		break;
 
 	case MPI_IOCSTATUS_SCSI_TASK_MGMT_FAILED:
 		/* XXX */
 		xs->error = XS_DRIVER_STUFFUP;
+		mpt_prt(mpt,"mpt_done: IOC SCSI task failed!");
+		restart = 1;
 		break;
 
 	case MPI_IOCSTATUS_SCSI_IOC_TERMINATED:
 		/* XXX */
 		xs->error = XS_DRIVER_STUFFUP;
+		mpt_prt(mpt,"mpt_done: IOC task terminated!");
+		restart = 1;
 		break;
 
 	case MPI_IOCSTATUS_SCSI_EXT_TERMINATED:
 		/* XXX This is a bus-reset */
 		xs->error = XS_DRIVER_STUFFUP;
+		mpt_prt(mpt,"mpt_done: IOC SCSI bus reset!");
+		restart = 1;
+		break;
+
+		case MPI_IOCSTATUS_SCSI_PROTOCOL_ERROR:
+		/*
+		 *FreeBSD and Linux indicate this is a phase error between
+		 *the IOC and the drive itself. 
+		*When this happens, the IOC becomes unhappy and stops processing
+		*all transactions.  Call mpt_timeout which knows how to
+		*get the IOC back on its feet.
+		 */
+		 mpt_prt(mpt,"mpt_done: IOC indicates protocol error -- recovering...");
+		xs->error = XS_TIMEOUT;
+		restart = 1;
+
 		break;
 
 	default:
 		/* XXX unrecognized HBA error */
 		xs->error = XS_DRIVER_STUFFUP;
+		mpt_prt(mpt,"mpt_done: IOC returned unknown code: 0x%x",le16toh(mpt_reply->IOCStatus));
+		restart = 1;
 		break;
 	}
 
@@ -645,6 +761,11 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl
 	}
 
  done:
+	if (le16toh(mpt_reply->IOCStatus) & MPI_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE) {
+		mpt_prt(mpt,"mpt_done: IOC has error - logging...\n");
+		mpt_ctlop(mpt, mpt_reply, reply);
+	}
+
 	/* If IOC done with this requeset, free it up. */
 	if (mpt_reply == NULL || (mpt_reply->MsgFlags & 0x80) == 0)
 		mpt_free_request(mpt, req);
@@ -655,6 +776,11 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl
 
 	if (xs != NULL)
 		scsipi_done(xs);
+
+	if (restart) {
+		mpt_prt(mpt,"mpt_done: IOC fatal error: restarting...");
+		mpt_restart(mpt, NULL);
+	}
 }
 
 static void
@@ -928,6 +1054,12 @@ mpt_run_xfer(mpt_softc_t *mpt, struct sc
 	if (mpt->verbose > 1)
 		mpt_print_scsi_io_request(mpt_req);
 
+		if (xs->timeout == 0) {
+			mpt_prt(mpt,"mpt_run_xfer: no timeout specified for request: 0x%x\n",
+			req->index);
+			xs->timeout = 500;
+		}
+
 	s = splbio();
 	if (__predict_true((xs->xs_control & XS_CTL_POLL) == 0))
 		callout_reset(&xs->xs_callout,
@@ -1340,7 +1472,44 @@ mpt_event_notify_reply(mpt_softc_t *mpt,
 	}
 }
 
-/* XXXJRT mpt_bus_reset() */
+static void
+mpt_bus_reset(mpt_softc_t *mpt)
+{
+	request_t *req;
+	MSG_SCSI_TASK_MGMT *mngt_req;
+	int s;
+
+	s = splbio();
+	if (mpt->mngt_req) {
+		/* request already queued; can't do more */
+		splx(s);
+		return;
+	}
+	req = mpt_get_request(mpt);
+	if (__predict_false(req == NULL)) {
+		mpt_prt(mpt, "no mngt request\n");
+		splx(s);
+		return;
+	}
+	mpt->mngt_req = req;
+	splx(s);
+	mngt_req = req->req_vbuf;
+	memset(mngt_req, 0, sizeof(*mngt_req));
+	mngt_req->Function = MPI_FUNCTION_SCSI_TASK_MGMT;
+	mngt_req->Bus = mpt->bus;
+	mngt_req->TargetID = 0;
+	mngt_req->ChainOffset = 0;
+	mngt_req->TaskType = MPI_SCSITASKMGMT_TASKTYPE_RESET_BUS;
+	mngt_req->Reserved1 = 0;
+	mngt_req->MsgFlags =
+	    mpt->is_fc ? MPI_SCSITASKMGMT_MSGFLAGS_LIP_RESET_OPTION : 0;
+	mngt_req->MsgContext = req->index;
+	mngt_req->TaskMsgContext = 0;
+	s = splbio();
+	mpt_send_handshake_cmd(mpt, sizeof(*mngt_req), mngt_req);
+	/*mpt_enable_ints(mpt);*/
+	splx(s);
+}
 
 /*****************************************************************************
  * SCSI interface routines
@@ -1382,3 +1551,23 @@ mpt_minphys(struct buf *bp)
 		bp->b_bcount = MPT_MAX_XFER;
 	minphys(bp);
 }
+
+static int
+mpt_ioctl(struct scsipi_channel *chan, u_long cmd, void *arg,
+    int flag, struct proc *p)
+{
+	mpt_softc_t *mpt;
+	int s;
+
+	mpt = device_private(chan->chan_adapter->adapt_dev);
+	switch (cmd) {
+	case SCBUSIORESET:
+		mpt_bus_reset(mpt);
+		s = splbio();
+		mpt_intr(mpt);
+		splx(s);
+		return(0);
+	default:
+		return (ENOTTY);
+	}
+}

Index: src/sys/dev/ic/mpt_netbsd.h
diff -u src/sys/dev/ic/mpt_netbsd.h:1.10 src/sys/dev/ic/mpt_netbsd.h:1.11
--- src/sys/dev/ic/mpt_netbsd.h:1.10	Sun Mar 18 21:05:21 2012
+++ src/sys/dev/ic/mpt_netbsd.h	Tue Apr  1 23:57:54 2014
@@ -1,4 +1,4 @@
-/*	$NetBSD: mpt_netbsd.h,v 1.10 2012/03/18 21:05:21 martin Exp $	*/
+/*	$NetBSD: mpt_netbsd.h,v 1.11 2014/04/01 23:57:54 buhrow Exp $	*/
 
 /*
  * Copyright (c) 2003 Wasabi Systems, Inc.
@@ -230,9 +230,11 @@ typedef struct mpt_softc {
 	/* SCSIPI and software management */
 	request_t		*request_pool;
 	SLIST_HEAD(req_queue, req_entry) request_free_list;
+	request_t      *mngt_req;
 
 	struct scsipi_adapter	sc_adapter;
 	struct scsipi_channel	sc_channel;
+	device_t       sc_scsibus_dv; /*So we can rescan in case of errors*/
 
 	uint32_t		sequence;	/* sequence number */
 	uint32_t		timeouts;	/* timeout count */

Reply via email to