Pete, this looks good.  

I'm going to hold off on pulling this in until I get a word from Roland
on if he's pulling in the driver patch to 2.6.19.  I'd rather get what I
have in to Roland's git tree, then add this as a 1-off patch.  

Roland, what do you think?

Thanks,
Steve.

On Fri, 2006-08-04 at 16:08 -0400, Pete Wyckoff wrote:
> [EMAIL PROTECTED] wrote on Thu, 03 Aug 2006 08:19 -0500:
> > I don't know when, or if I'll have time to address this limitation in
> > the ammasso firmware.  But there is a way (if anyone wants to implement
> > it):
> > 
> > 1) add a timer to the c2_qp struct and start it when c2_llp_connect() is
> > called.
> > 
> > 2) if the timer fires, generate a CONNECT_REPLY upcall to the IWCM with
> > status TIMEDOUT.  Mark in the qp that the connect timed out. 
> > 
> > 3) deal with the rare condition that the timer fires at or about the
> > same time the connection really does get established:  if the adapter
> > passes up a CCAE_ACTIVE_CONNECT_RESULTS -after- the timer fires but
> > before the qp is destroyed by the consumer, then you must squelch this
> > event and probably destroy the HWQP at least from the adapter's
> > perspective...
> 
> Here's a first cut.  It fixes one source of process hangs I had been
> running into.  A couple of issues:
> 
>     - What is the proper connect timeout?  Old BSD used 24 sec.
>     Modern linux seems to be around 3 minutes based on
>     experimentation.
> 
>     - I used the new hrtimer code.  Backports will need major
>     changes to use the old timer interface.
> 
>     - I also added code in c2_free_qp() to kill the connection (and
>     release the qp ref) to handle the ctrl-C case, or any other
>     event that would cause the QP to go away while a connection was
>     outstanding.
> 
>     - No attempt is made to cleanup hardware state.  That code could
>     go in connect_timer_expire(), although there may be issues on
>     who is holding what locks when the timer expires.
> 
> This against r8688.
> 
>               -- Pete
> 
> Attempt to work around buggy Ammasso firmware that does not timeout
> active connection requests.  Also actively cancels an outstanding
> connections when the QP is being freed.
> 
> Signed-off-by: Pete Wyckoff <[EMAIL PROTECTED]>
> 
> Index: linux-kernel/infiniband/hw/amso1100/c2_qp.c
> ===================================================================
> --- linux-kernel/infiniband/hw/amso1100/c2_qp.c       (revision 8688)
> +++ linux-kernel/infiniband/hw/amso1100/c2_qp.c       (working copy)
> @@ -517,6 +517,8 @@
>       c2dev->qp_table.map[qp->qpn] = qp;
>       spin_unlock_irq(&c2dev->qp_table.lock);
>  
> +     hrtimer_init(&qp->connect_timer, CLOCK_MONOTONIC, HRTIMER_REL);
> +
>       return 0;
>  
>        bail6:
> @@ -545,6 +547,13 @@
>       recv_cq = to_c2cq(qp->ibqp.recv_cq);
>  
>       /*
> +      * If the timer was still active, a connection attempt is outstanding.
> +      * Call the expire function directly to release the ref on the qp.
> +      */
> +     if (hrtimer_cancel(&qp->connect_timer))
> +             qp->connect_timer.function(&qp->connect_timer);
> +
> +     /*
>        * Lock CQs here, so that CQ polling code can do QP lookup
>        * without taking a lock.
>        */
> Index: linux-kernel/infiniband/hw/amso1100/c2_ae.c
> ===================================================================
> --- linux-kernel/infiniband/hw/amso1100/c2_ae.c       (revision 8688)
> +++ linux-kernel/infiniband/hw/amso1100/c2_ae.c       (working copy)
> @@ -226,6 +226,14 @@
>                               cm_event.private_data_len = 0;
>                               cm_event.private_data = NULL;
>                       }
> +
> +                     /*
> +                      * Cancel the connect timeout; but if it already
> +                      * ran, throw away this hardware connect result
> +                      * that raced against it.
> +                      */
> +                     if (hrtimer_cancel(&qp->connect_timer) == 0)
> +                             goto ignore_it;
>                       if (cm_event.private_data_len) {
>                               /* copy private data */
>                               pdata =
> Index: linux-kernel/infiniband/hw/amso1100/c2_provider.h
> ===================================================================
> --- linux-kernel/infiniband/hw/amso1100/c2_provider.h (revision 8688)
> +++ linux-kernel/infiniband/hw/amso1100/c2_provider.h (working copy)
> @@ -120,6 +120,7 @@
>  
>       struct c2_mq sq_mq;
>       struct c2_mq rq_mq;
> +     struct hrtimer connect_timer;
>  };
>  
>  struct c2_cr_query_attrs {
> Index: linux-kernel/infiniband/hw/amso1100/c2_cm.c
> ===================================================================
> --- linux-kernel/infiniband/hw/amso1100/c2_cm.c       (revision 8688)
> +++ linux-kernel/infiniband/hw/amso1100/c2_cm.c       (working copy)
> @@ -36,6 +36,22 @@
>  #include "c2_vq.h"
>  #include <rdma/iw_cm.h>
>  
> +static int connect_timer_expire(struct hrtimer *timer)
> +{
> +     struct c2_qp *qp;
> +
> +     qp = container_of(timer, struct c2_qp, connect_timer);
> +     if (qp->cm_id && qp->cm_id->event_handler) {
> +             struct iw_cm_event cm_event = {
> +                     .event = IW_CM_EVENT_CONNECT_REPLY,
> +                     .status = IW_CM_EVENT_STATUS_TIMEOUT,
> +             };
> +             dprintk("%s: sending connect timeout event\n", __func__);
> +             qp->cm_id->event_handler(qp->cm_id, &cm_event);
> +     }
> +     return HRTIMER_NORESTART;
> +}
> +
>  int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
>  {
>       struct c2_dev *c2dev = to_c2dev(cm_id->device);
> @@ -123,6 +139,15 @@
>               cm_id->provider_data = NULL;
>               qp->cm_id = NULL;
>               cm_id->rem_ref(cm_id);
> +     } else {
> +             /*
> +              * Start connect timer.  Since buggy firmware will not
> +              * time out active connections, ever, this timer is used
> +              * to force expiry after 30 sec.
> +              */
> +             qp->connect_timer.function = connect_timer_expire;
> +             hrtimer_start(&qp->connect_timer, ktime_set(30, 0),
> +                           HRTIMER_REL);
>       }
>       return err;
>  }


_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to