Re: timeout io on mpii(4)

2010-12-30 Thread Mike Belopuhov
On Fri, Dec 24, 2010 at 16:09 +1000, David Gwynne wrote:
 i can reliably produce a situation where an io on a disk attached
 to mpii(4) never completes. this implements timeouts on scsi io so
 we can recover from this situation.
 
 ok?
 

although, you've already committed the change, i have two questions
regarding this.  please find them inline.

 Index: mpii.c
 ===
 RCS file: /cvs/src/sys/dev/pci/mpii.c,v
 retrieving revision 1.35
 diff -u -p -r1.35 mpii.c
 --- mpii.c23 Aug 2010 00:53:36 -  1.35
 +++ mpii.c24 Dec 2010 06:04:38 -
 @@ -4448,6 +4466,7 @@ mpii_scsi_cmd(struct scsi_xfer *xs)
   DNPRINTF(MPII_D_CMD, %s:  Offset0: 0x%02x\n, DEVNAME(sc),
   io-sgl_offset0);
  
 + timeout_set(xs-stimeout, mpii_scsi_cmd_tmo, ccb);
   if (xs-flags  SCSI_POLL) {
   if (mpii_poll(sc, ccb) != 0) {
   xs-error = XS_DRIVER_STUFFUP;
 @@ -4459,10 +4478,66 @@ mpii_scsi_cmd(struct scsi_xfer *xs)
   DNPRINTF(MPII_D_CMD, %s:mpii_scsi_cmd(): opcode: %02x 
   datalen: %d\n, DEVNAME(sc), xs-cmd-opcode, xs-datalen);
  
 + timeout_add_msec(xs-stimeout, xs-timeout);
   mpii_start(sc, ccb);
  }
  
  void
 +mpii_scsi_cmd_tmo(void *xccb)
 +{
 + struct mpii_ccb *ccb = xccb;
 + struct mpii_softc   *sc = ccb-ccb_sc;
 +
 + printf(%s: mpii_scsi_cmd_tmo\n, DEVNAME(sc));
 +
 + mtx_enter(sc-sc_ccb_mtx);
 + if (ccb-ccb_state == MPII_CCB_QUEUED) {
 + ccb-ccb_state = MPII_CCB_TIMEOUT;
 + SLIST_INSERT_HEAD(sc-sc_ccb_tmos, ccb, ccb_link);
 + }
 + mtx_leave(sc-sc_ccb_mtx);
 +
 + scsi_ioh_add(sc-sc_ccb_tmo_handler);
 +}
 +
 +void
 +mpii_scsi_cmd_tmo_handler(void *cookie, void *io)
 +{
 + struct mpii_softc   *sc = cookie;
 + struct mpii_ccb *tccb = io;
 + struct mpii_ccb *ccb;
 + struct mpii_msg_scsi_task_request   *stq;
 +
 + mtx_enter(sc-sc_ccb_mtx);
 + ccb = SLIST_FIRST(sc-sc_ccb_tmos);
 + if (ccb != NULL) {
 + SLIST_REMOVE_HEAD(sc-sc_ccb_tmos, ccb_link);
 + ccb-ccb_state = MPII_CCB_QUEUED;
 + }
 + /* should remove any other ccbs for the same dev handle */
 + mtx_leave(sc-sc_ccb_mtx);
 +
 + if (ccb == NULL) {
 + scsi_io_put(sc-sc_iopool, tccb);
 + return;
 + }
 +
 + stq = tccb-ccb_cmd;
 + stq-function = MPII_FUNCTION_SCSI_TASK_MGMT;
 + stq-task_type = MPII_SCSI_TASK_TARGET_RESET;
 + stq-dev_handle = htole16(ccb-ccb_dev_handle);
 +

why do you issue 'target reset' instead of 'abort task' here?

 + tccb-ccb_done = mpii_scsi_cmd_tmo_done;
 + mpii_start(sc, tccb);
 +}
 +
 +void
 +mpii_scsi_cmd_tmo_done(struct mpii_ccb *tccb)
 +{
 + mpii_scsi_cmd_tmo_handler(tccb-ccb_sc, tccb);
 +}
 +

why do you call this function again from here?



Re: timeout io on mpii(4)

2010-12-30 Thread David Gwynne
On 31/12/2010, at 3:51 AM, Mike Belopuhov wrote:

 On Fri, Dec 24, 2010 at 16:09 +1000, David Gwynne wrote:
 i can reliably produce a situation where an io on a disk attached
 to mpii(4) never completes. this implements timeouts on scsi io so
 we can recover from this situation.

 ok?


 although, you've already committed the change, i have two questions
 regarding this.  please find them inline.

cool :) answers inline too.


 Index: mpii.c
 ===
 RCS file: /cvs/src/sys/dev/pci/mpii.c,v
 retrieving revision 1.35
 diff -u -p -r1.35 mpii.c
 --- mpii.c   23 Aug 2010 00:53:36 -  1.35
 +++ mpii.c   24 Dec 2010 06:04:38 -
 @@ -4448,6 +4466,7 @@ mpii_scsi_cmd(struct scsi_xfer *xs)
  DNPRINTF(MPII_D_CMD, %s:  Offset0: 0x%02x\n, DEVNAME(sc),
  io-sgl_offset0);

 +timeout_set(xs-stimeout, mpii_scsi_cmd_tmo, ccb);
  if (xs-flags  SCSI_POLL) {
  if (mpii_poll(sc, ccb) != 0) {
  xs-error = XS_DRIVER_STUFFUP;
 @@ -4459,10 +4478,66 @@ mpii_scsi_cmd(struct scsi_xfer *xs)
  DNPRINTF(MPII_D_CMD, %s:mpii_scsi_cmd(): opcode: %02x 
  datalen: %d\n, DEVNAME(sc), xs-cmd-opcode, xs-datalen);

 +timeout_add_msec(xs-stimeout, xs-timeout);
  mpii_start(sc, ccb);
 }

 void
 +mpii_scsi_cmd_tmo(void *xccb)
 +{
 +struct mpii_ccb *ccb = xccb;
 +struct mpii_softc   *sc = ccb-ccb_sc;
 +
 +printf(%s: mpii_scsi_cmd_tmo\n, DEVNAME(sc));
 +
 +mtx_enter(sc-sc_ccb_mtx);
 +if (ccb-ccb_state == MPII_CCB_QUEUED) {
 +ccb-ccb_state = MPII_CCB_TIMEOUT;
 +SLIST_INSERT_HEAD(sc-sc_ccb_tmos, ccb, ccb_link);
 +}
 +mtx_leave(sc-sc_ccb_mtx);
 +
 +scsi_ioh_add(sc-sc_ccb_tmo_handler);
 +}
 +
 +void
 +mpii_scsi_cmd_tmo_handler(void *cookie, void *io)
 +{
 +struct mpii_softc   *sc = cookie;
 +struct mpii_ccb *tccb = io;
 +struct mpii_ccb *ccb;
 +struct mpii_msg_scsi_task_request   *stq;
 +
 +mtx_enter(sc-sc_ccb_mtx);
 +ccb = SLIST_FIRST(sc-sc_ccb_tmos);
 +if (ccb != NULL) {
 +SLIST_REMOVE_HEAD(sc-sc_ccb_tmos, ccb_link);
 +ccb-ccb_state = MPII_CCB_QUEUED;
 +}
 +/* should remove any other ccbs for the same dev handle */
 +mtx_leave(sc-sc_ccb_mtx);
 +
 +if (ccb == NULL) {
 +scsi_io_put(sc-sc_iopool, tccb);
 +return;
 +}
 +
 +stq = tccb-ccb_cmd;
 +stq-function = MPII_FUNCTION_SCSI_TASK_MGMT;
 +stq-task_type = MPII_SCSI_TASK_TARGET_RESET;
 +stq-dev_handle = htole16(ccb-ccb_dev_handle);
 +

 why do you issue 'target reset' instead of 'abort task' here?

thats what solaris and linux do.


 +tccb-ccb_done = mpii_scsi_cmd_tmo_done;
 +mpii_start(sc, tccb);
 +}
 +
 +void
 +mpii_scsi_cmd_tmo_done(struct mpii_ccb *tccb)
 +{
 +mpii_scsi_cmd_tmo_handler(tccb-ccb_sc, tccb);
 +}
 +

 why do you call this function again from here?

xs timeouts put the xs on a list to be serviced. the io handler services that
list. the done function calling the handler again is the way it pulls the next
one off the list.

dlg



Re: timeout io on mpii(4)

2010-12-24 Thread Marco Peereboom
This is great but the real question is why does the IO get jammed?

On Fri, Dec 24, 2010 at 04:09:23PM +1000, David Gwynne wrote:
 i can reliably produce a situation where an io on a disk attached
 to mpii(4) never completes. this implements timeouts on scsi io so
 we can recover from this situation.
 
 ok?
 
 Index: mpii.c
 ===
 RCS file: /cvs/src/sys/dev/pci/mpii.c,v
 retrieving revision 1.35
 diff -u -p -r1.35 mpii.c
 --- mpii.c23 Aug 2010 00:53:36 -  1.35
 +++ mpii.c24 Dec 2010 06:04:38 -
 @@ -1757,7 +1757,8 @@ struct mpii_ccb {
   volatile enum {
   MPII_CCB_FREE,
   MPII_CCB_READY,
 - MPII_CCB_QUEUED
 + MPII_CCB_QUEUED,
 + MPII_CCB_TIMEOUT
   }   ccb_state;
  
   void(*ccb_done)(struct mpii_ccb *);
 @@ -1822,6 +1823,15 @@ struct mpii_softc {
   struct mpii_ccb_listsc_ccb_free;
   struct mutexsc_ccb_free_mtx;
  
 + struct mutexsc_ccb_mtx;
 + /*
 +  * this protects the ccb state and list entry
 +  * between mpii_scsi_cmd and scsidone.
 +  */
 +
 + struct mpii_ccb_listsc_ccb_tmos;
 + struct scsi_iohandler   sc_ccb_tmo_handler;
 +
   struct scsi_iopool  sc_iopool;
  
   struct mpii_dmamem  *sc_requests;
 @@ -1894,6 +1904,10 @@ intmpii_alloc_queues(struct mpii_softc
  void mpii_push_reply(struct mpii_softc *, struct mpii_rcb *);
  void mpii_push_replies(struct mpii_softc *);
  
 +void mpii_scsi_cmd_tmo(void *);
 +void mpii_scsi_cmd_tmo_handler(void *, void *);
 +void mpii_scsi_cmd_tmo_done(struct mpii_ccb *);
 +
  int  mpii_alloc_dev(struct mpii_softc *);
  int  mpii_insert_dev(struct mpii_softc *, struct mpii_device *);
  int  mpii_remove_dev(struct mpii_softc *, struct mpii_device *);
 @@ -4035,7 +4049,11 @@ mpii_alloc_ccbs(struct mpii_softc *sc)
   int i;
  
   SLIST_INIT(sc-sc_ccb_free);
 + SLIST_INIT(sc-sc_ccb_tmos);
   mtx_init(sc-sc_ccb_free_mtx, IPL_BIO);
 + mtx_init(sc-sc_ccb_mtx, IPL_BIO);
 + scsi_ioh_set(sc-sc_ccb_tmo_handler, sc-sc_iopool,
 + mpii_scsi_cmd_tmo_handler, sc);
  
   sc-sc_ccbs = malloc(sizeof(*ccb) * (sc-sc_request_depth-1),
   M_DEVBUF, M_NOWAIT | M_ZERO);
 @@ -4448,6 +4466,7 @@ mpii_scsi_cmd(struct scsi_xfer *xs)
   DNPRINTF(MPII_D_CMD, %s:  Offset0: 0x%02x\n, DEVNAME(sc),
   io-sgl_offset0);
  
 + timeout_set(xs-stimeout, mpii_scsi_cmd_tmo, ccb);
   if (xs-flags  SCSI_POLL) {
   if (mpii_poll(sc, ccb) != 0) {
   xs-error = XS_DRIVER_STUFFUP;
 @@ -4459,10 +4478,66 @@ mpii_scsi_cmd(struct scsi_xfer *xs)
   DNPRINTF(MPII_D_CMD, %s:mpii_scsi_cmd(): opcode: %02x 
   datalen: %d\n, DEVNAME(sc), xs-cmd-opcode, xs-datalen);
  
 + timeout_add_msec(xs-stimeout, xs-timeout);
   mpii_start(sc, ccb);
  }
  
  void
 +mpii_scsi_cmd_tmo(void *xccb)
 +{
 + struct mpii_ccb *ccb = xccb;
 + struct mpii_softc   *sc = ccb-ccb_sc;
 +
 + printf(%s: mpii_scsi_cmd_tmo\n, DEVNAME(sc));
 +
 + mtx_enter(sc-sc_ccb_mtx);
 + if (ccb-ccb_state == MPII_CCB_QUEUED) {
 + ccb-ccb_state = MPII_CCB_TIMEOUT;
 + SLIST_INSERT_HEAD(sc-sc_ccb_tmos, ccb, ccb_link);
 + }
 + mtx_leave(sc-sc_ccb_mtx);
 +
 + scsi_ioh_add(sc-sc_ccb_tmo_handler);
 +}
 +
 +void
 +mpii_scsi_cmd_tmo_handler(void *cookie, void *io)
 +{
 + struct mpii_softc   *sc = cookie;
 + struct mpii_ccb *tccb = io;
 + struct mpii_ccb *ccb;
 + struct mpii_msg_scsi_task_request   *stq;
 +
 + mtx_enter(sc-sc_ccb_mtx);
 + ccb = SLIST_FIRST(sc-sc_ccb_tmos);
 + if (ccb != NULL) {
 + SLIST_REMOVE_HEAD(sc-sc_ccb_tmos, ccb_link);
 + ccb-ccb_state = MPII_CCB_QUEUED;
 + }
 + /* should remove any other ccbs for the same dev handle */
 + mtx_leave(sc-sc_ccb_mtx);
 +
 + if (ccb == NULL) {
 + scsi_io_put(sc-sc_iopool, tccb);
 + return;
 + }
 +
 + stq = tccb-ccb_cmd;
 + stq-function = MPII_FUNCTION_SCSI_TASK_MGMT;
 + stq-task_type = MPII_SCSI_TASK_TARGET_RESET;
 + stq-dev_handle = htole16(ccb-ccb_dev_handle);
 +
 + tccb-ccb_done = mpii_scsi_cmd_tmo_done;
 + mpii_start(sc, tccb);
 +}
 +
 +void
 +mpii_scsi_cmd_tmo_done(struct mpii_ccb *tccb)
 +{
 + mpii_scsi_cmd_tmo_handler(tccb-ccb_sc, tccb);
 +}
 +
 +void
  mpii_scsi_cmd_done(struct mpii_ccb *ccb)
  {
   struct mpii_msg_scsi_io_error   *sie;
 @@ -4470,6 +4545,14 @@ mpii_scsi_cmd_done(struct mpii_ccb *ccb)
   struct scsi_xfer*xs = ccb-ccb_cookie;
   struct