Module Name: src Committed By: buhrow Date: Tue Apr 1 23:57:54 UTC 2014
Modified Files: src/sys/dev/ic: mpt_netbsd.c mpt_netbsd.h Log Message: Checking in changes to improve error handling. Specifically: - If commands time out, clear the queues to the card and perform a soft reset on the LSI hardware, since when these timeouts occur, the LSI firmware is not graceful about recovering at all. - Recover gracefully from more kinds of errors using the same recovery mechanism listed above. Also, implement mpt_ioctl() to handle bus reset requests from scsictl(8). To generate a diff of this commit: cvs rdiff -u -r1.19 -r1.20 src/sys/dev/ic/mpt_netbsd.c cvs rdiff -u -r1.10 -r1.11 src/sys/dev/ic/mpt_netbsd.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/dev/ic/mpt_netbsd.c diff -u src/sys/dev/ic/mpt_netbsd.c:1.19 src/sys/dev/ic/mpt_netbsd.c:1.20 --- src/sys/dev/ic/mpt_netbsd.c:1.19 Sun Sep 23 01:13:21 2012 +++ src/sys/dev/ic/mpt_netbsd.c Tue Apr 1 23:57:54 2014 @@ -1,4 +1,4 @@ -/* $NetBSD: mpt_netbsd.c,v 1.19 2012/09/23 01:13:21 chs Exp $ */ +/* $NetBSD: mpt_netbsd.c,v 1.20 2014/04/01 23:57:54 buhrow Exp $ */ /* * Copyright (c) 2003 Wasabi Systems, Inc. @@ -77,22 +77,28 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: mpt_netbsd.c,v 1.19 2012/09/23 01:13:21 chs Exp $"); +__KERNEL_RCSID(0, "$NetBSD: mpt_netbsd.c,v 1.20 2014/04/01 23:57:54 buhrow Exp $"); #include <dev/ic/mpt.h> /* pulls in all headers */ +#include <sys/scsiio.h> static int mpt_poll(mpt_softc_t *, struct scsipi_xfer *, int); static void mpt_timeout(void *); +static void mpt_restart(mpt_softc_t *, request_t *); static void mpt_done(mpt_softc_t *, uint32_t); +static int mpt_drain_queue(mpt_softc_t *); static void mpt_run_xfer(mpt_softc_t *, struct scsipi_xfer *); static void mpt_set_xfer_mode(mpt_softc_t *, struct scsipi_xfer_mode *); static void mpt_get_xfer_mode(mpt_softc_t *, struct scsipi_periph *); static void mpt_ctlop(mpt_softc_t *, void *vmsg, uint32_t); static void mpt_event_notify_reply(mpt_softc_t *, MSG_EVENT_NOTIFY_REPLY *); +static void mpt_bus_reset(mpt_softc_t *); static void mpt_scsipi_request(struct scsipi_channel *, scsipi_adapter_req_t, void *); static void mpt_minphys(struct buf *); +static int mpt_ioctl(struct scsipi_channel *, u_long, void *, int, + struct proc *); /* * XXX - this assumes the device_private() of the attachement starts with @@ -121,6 +127,7 @@ mpt_scsipi_attach(mpt_softc_t *mpt) adapt->adapt_max_periph = maxq - 2; adapt->adapt_request = mpt_scsipi_request; adapt->adapt_minphys = mpt_minphys; + adapt->adapt_ioctl = mpt_ioctl; /* Fill in the scsipi_channel. 
*/ memset(chan, 0, sizeof(*chan)); @@ -138,7 +145,8 @@ mpt_scsipi_attach(mpt_softc_t *mpt) chan->chan_ntargets = mpt->mpt_max_devices; chan->chan_id = mpt->mpt_ini_id; - (void) config_found(mpt->sc_dev, &mpt->sc_channel, scsiprint); +/*Save the output of the config so we can rescan the bus in case of errors*/ + mpt->sc_scsibus_dv = config_found(mpt->sc_dev, &mpt->sc_channel, scsiprint); } int @@ -303,26 +311,11 @@ mpt_intr(void *arg) { mpt_softc_t *mpt = arg; int nrepl = 0; - uint32_t reply; if ((mpt_read(mpt, MPT_OFFSET_INTR_STATUS) & MPT_INTR_REPLY_READY) == 0) return (0); - reply = mpt_pop_reply_queue(mpt); - while (reply != MPT_REPLY_EMPTY) { - nrepl++; - if (mpt->verbose > 1) { - if ((reply & MPT_CONTEXT_REPLY) != 0) { - /* Address reply; IOC has something to say */ - mpt_print_reply(MPT_REPLY_PTOV(mpt, reply)); - } else { - /* Context reply; all went well */ - mpt_prt(mpt, "context %u reply OK", reply); - } - } - mpt_done(mpt, reply); - reply = mpt_pop_reply_queue(mpt); - } +nrepl = mpt_drain_queue(mpt); return (nrepl != 0); } @@ -357,13 +350,20 @@ static void mpt_timeout(void *arg) { request_t *req = arg; - struct scsipi_xfer *xs = req->xfer; - struct scsipi_periph *periph = xs->xs_periph; - mpt_softc_t *mpt = DEV_TO_MPT( - periph->periph_channel->chan_adapter->adapt_dev); - uint32_t oseq; - int s; - + struct scsipi_xfer *xs; + struct scsipi_periph *periph; + mpt_softc_t *mpt; + uint32_t oseq; + int s, nrepl = 0; + +if (req->xfer == NULL) { + printf("mpt_timeout: NULL xfer for request index 0x%x, sequenc 0x%x\n", + req->index, req->sequence); + return; + } + xs = req->xfer; + periph = xs->xs_periph; + mpt = (void *) periph->periph_channel->chan_adapter->adapt_dev; scsipi_printaddr(periph); printf("command timeout\n"); @@ -373,11 +373,28 @@ mpt_timeout(void *arg) mpt->timeouts++; if (mpt_intr(mpt)) { if (req->sequence != oseq) { + mpt->success ++; mpt_prt(mpt, "recovered from command timeout"); splx(s); return; } } + + /* + *Ensure the IOC is really done 
giving us data since it appears it can + *sometimes fail to give us interrupts under heavy load. + */ + nrepl = mpt_drain_queue(mpt); + if (nrepl ) { + mpt_prt(mpt, "mpt_timeout: recovered %d commands",nrepl); + } + + if (req->sequence != oseq) { + mpt->success ++; + splx(s); + return; + } + mpt_prt(mpt, "timeout on request index = 0x%x, seq = 0x%08x", req->index, req->sequence); @@ -390,14 +407,83 @@ mpt_timeout(void *arg) if (mpt->verbose > 1) mpt_print_scsi_io_request((MSG_SCSI_IO_REQUEST *)req->req_vbuf); - /* XXX WHAT IF THE IOC IS STILL USING IT?? */ - req->xfer = NULL; - mpt_free_request(mpt, req); - xs->error = XS_TIMEOUT; - scsipi_done(xs); + splx(s); + mpt_restart(mpt, req); +} + +static void +mpt_restart(mpt_softc_t *mpt, request_t *req0) +{ + int i, s, nreq; + request_t *req; + struct scsipi_xfer *xs; + + /* first, reset the IOC, leaving stopped so all requests are idle */ + if (mpt_soft_reset(mpt) != MPT_OK) { + mpt_prt(mpt, "soft reset failed"); + /* don't try a hard reset since this mangles the PCI configuration registers */ + return; + } + /* freeze the channel so scsipi doesn't queue more commands */ + scsipi_channel_freeze(&mpt->sc_channel, 1); + + /* return all pending requests to scsipi and de-allocate them */ + s = splbio(); + nreq = 0; + for (i = 0; i < MPT_MAX_REQUESTS(mpt); i++) { + req = &mpt->request_pool[i]; + xs = req->xfer; + if (xs != NULL) { + if (xs->datalen != 0) + bus_dmamap_unload(mpt->sc_dmat, req->dmap); + req->xfer = NULL; + callout_stop(&xs->xs_callout); + if (req != req0) { + nreq++; + xs->error = XS_REQUEUE; + } + scsipi_done(xs); + /* don't really need to mpt_free_request() since mpt_init() below will free all requests anyway */ + mpt_free_request(mpt, req); + } + } splx(s); + if (nreq > 0) + mpt_prt(mpt, "re-queued %d requests", nreq); + + /* re-initialize the IOC (which restarts it) */ + if (mpt_init(mpt, MPT_DB_INIT_HOST) == 0) + mpt_prt(mpt, "restart succeeded"); + /* else error message already printed */ + + /* thaw 
the channel, causing scsipi to re-queue the commands */ + scsipi_channel_thaw(&mpt->sc_channel, 1); +} + +static +int mpt_drain_queue(mpt_softc_t *mpt) +{ + int nrepl = 0; + uint32_t reply; + + reply = mpt_pop_reply_queue(mpt); + while (reply != MPT_REPLY_EMPTY) { + nrepl++; + if (mpt->verbose > 1) { + if ((reply & MPT_CONTEXT_REPLY) != 0) { + /* Address reply; IOC has something to say */ + mpt_print_reply(MPT_REPLY_PTOV(mpt, reply)); + } else { + /* Context reply; all went well */ + mpt_prt(mpt, "context %u reply OK", reply); + } + } + mpt_done(mpt, reply); + reply = mpt_pop_reply_queue(mpt); + } + return (nrepl); } static void @@ -409,6 +495,7 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl request_t *req; MSG_REQUEST_HEADER *mpt_req; MSG_SCSI_IO_REPLY *mpt_reply; + int restart = 0; /*nonzero if we need to restart the IOC*/ if (__predict_true((reply & MPT_CONTEXT_REPLY) == 0)) { /* context reply (ok) */ @@ -468,6 +555,8 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl if (__predict_false(mpt_req->Function == MPI_FUNCTION_SCSI_TASK_MGMT)) { if (mpt->verbose > 1) mpt_prt(mpt, "mpt_done: TASK MGMT"); + KASSERT(req == mpt->mngt_req); + mpt->mngt_req = NULL; goto done; } @@ -544,9 +633,10 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl } xs->status = mpt_reply->SCSIStatus; - switch (le16toh(mpt_reply->IOCStatus)) { + switch ((le16toh(mpt_reply->IOCStatus) & MPI_IOCSTATUS_MASK)) { case MPI_IOCSTATUS_SCSI_DATA_OVERRUN: xs->error = XS_DRIVER_STUFFUP; + mpt_prt(mpt,"mpt_done: IOC overrun!"); break; case MPI_IOCSTATUS_SCSI_DATA_UNDERRUN: @@ -605,30 +695,56 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl case MPI_IOCSTATUS_SCSI_RESIDUAL_MISMATCH: xs->error = XS_DRIVER_STUFFUP; + mpt_prt(mpt,"mpt_done: IOC SCSI residual mismatch!"); + restart = 1; break; case MPI_IOCSTATUS_SCSI_TASK_TERMINATED: /* XXX What should we do here? 
*/ + mpt_prt(mpt,"mpt_done: IOC SCSI task terminated!"); + restart = 1; break; case MPI_IOCSTATUS_SCSI_TASK_MGMT_FAILED: /* XXX */ xs->error = XS_DRIVER_STUFFUP; + mpt_prt(mpt,"mpt_done: IOC SCSI task failed!"); + restart = 1; break; case MPI_IOCSTATUS_SCSI_IOC_TERMINATED: /* XXX */ xs->error = XS_DRIVER_STUFFUP; + mpt_prt(mpt,"mpt_done: IOC task terminated!"); + restart = 1; break; case MPI_IOCSTATUS_SCSI_EXT_TERMINATED: /* XXX This is a bus-reset */ xs->error = XS_DRIVER_STUFFUP; + mpt_prt(mpt,"mpt_done: IOC SCSI bus reset!"); + restart = 1; + break; + + case MPI_IOCSTATUS_SCSI_PROTOCOL_ERROR: + /* + *FreeBSD and Linux indicate this is a phase error between + *the IOC and the drive itself. + *When this happens, the IOC becomes unhappy and stops processing + *all transactions. Call mpt_timeout which knows how to + *get the IOC back on its feet. + */ + mpt_prt(mpt,"mpt_done: IOC indicates protocol error -- recovering..."); + xs->error = XS_TIMEOUT; + restart = 1; + break; default: /* XXX unrecognized HBA error */ xs->error = XS_DRIVER_STUFFUP; + mpt_prt(mpt,"mpt_done: IOC returned unknown code: 0x%x",le16toh(mpt_reply->IOCStatus)); + restart = 1; break; } @@ -645,6 +761,11 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl } done: + if (le16toh(mpt_reply->IOCStatus) & MPI_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE) { + mpt_prt(mpt,"mpt_done: IOC has error - logging...\n"); + mpt_ctlop(mpt, mpt_reply, reply); + } + /* If IOC done with this requeset, free it up. 
*/ if (mpt_reply == NULL || (mpt_reply->MsgFlags & 0x80) == 0) mpt_free_request(mpt, req); @@ -655,6 +776,11 @@ mpt_done(mpt_softc_t *mpt, uint32_t repl if (xs != NULL) scsipi_done(xs); + + if (restart) { + mpt_prt(mpt,"mpt_done: IOC fatal error: restarting..."); + mpt_restart(mpt, NULL); + } } static void @@ -928,6 +1054,12 @@ mpt_run_xfer(mpt_softc_t *mpt, struct sc if (mpt->verbose > 1) mpt_print_scsi_io_request(mpt_req); + if (xs->timeout == 0) { + mpt_prt(mpt,"mpt_run_xfer: no timeout specified for request: 0x%x\n", + req->index); + xs->timeout = 500; + } + s = splbio(); if (__predict_true((xs->xs_control & XS_CTL_POLL) == 0)) callout_reset(&xs->xs_callout, @@ -1340,7 +1472,44 @@ mpt_event_notify_reply(mpt_softc_t *mpt, } } -/* XXXJRT mpt_bus_reset() */ +static void +mpt_bus_reset(mpt_softc_t *mpt) +{ + request_t *req; + MSG_SCSI_TASK_MGMT *mngt_req; + int s; + + s = splbio(); + if (mpt->mngt_req) { + /* request already queued; can't do more */ + splx(s); + return; + } + req = mpt_get_request(mpt); + if (__predict_false(req == NULL)) { + mpt_prt(mpt, "no mngt request\n"); + splx(s); + return; + } + mpt->mngt_req = req; + splx(s); + mngt_req = req->req_vbuf; + memset(mngt_req, 0, sizeof(*mngt_req)); + mngt_req->Function = MPI_FUNCTION_SCSI_TASK_MGMT; + mngt_req->Bus = mpt->bus; + mngt_req->TargetID = 0; + mngt_req->ChainOffset = 0; + mngt_req->TaskType = MPI_SCSITASKMGMT_TASKTYPE_RESET_BUS; + mngt_req->Reserved1 = 0; + mngt_req->MsgFlags = + mpt->is_fc ? 
MPI_SCSITASKMGMT_MSGFLAGS_LIP_RESET_OPTION : 0; + mngt_req->MsgContext = req->index; + mngt_req->TaskMsgContext = 0; + s = splbio(); + mpt_send_handshake_cmd(mpt, sizeof(*mngt_req), mngt_req); + /*mpt_enable_ints(mpt);*/ + splx(s); +} /***************************************************************************** * SCSI interface routines @@ -1382,3 +1551,23 @@ mpt_minphys(struct buf *bp) bp->b_bcount = MPT_MAX_XFER; minphys(bp); } + +static int +mpt_ioctl(struct scsipi_channel *chan, u_long cmd, void *arg, + int flag, struct proc *p) +{ + mpt_softc_t *mpt; + int s; + + mpt = device_private(chan->chan_adapter->adapt_dev); + switch (cmd) { + case SCBUSIORESET: + mpt_bus_reset(mpt); + s = splbio(); + mpt_intr(mpt); + splx(s); + return(0); + default: + return (ENOTTY); + } +} Index: src/sys/dev/ic/mpt_netbsd.h diff -u src/sys/dev/ic/mpt_netbsd.h:1.10 src/sys/dev/ic/mpt_netbsd.h:1.11 --- src/sys/dev/ic/mpt_netbsd.h:1.10 Sun Mar 18 21:05:21 2012 +++ src/sys/dev/ic/mpt_netbsd.h Tue Apr 1 23:57:54 2014 @@ -1,4 +1,4 @@ -/* $NetBSD: mpt_netbsd.h,v 1.10 2012/03/18 21:05:21 martin Exp $ */ +/* $NetBSD: mpt_netbsd.h,v 1.11 2014/04/01 23:57:54 buhrow Exp $ */ /* * Copyright (c) 2003 Wasabi Systems, Inc. @@ -230,9 +230,11 @@ typedef struct mpt_softc { /* SCSIPI and software management */ request_t *request_pool; SLIST_HEAD(req_queue, req_entry) request_free_list; + request_t *mngt_req; struct scsipi_adapter sc_adapter; struct scsipi_channel sc_channel; + device_t sc_scsibus_dv; /*So we can rescan in case of errors*/ uint32_t sequence; /* sequence number */ uint32_t timeouts; /* timeout count */