[EMAIL PROTECTED] wrote on Thu, 03 Aug 2006 08:19 -0500:
> I don't know when, or if I'll have time to address this limitation in
> the ammasso firmware.  But there is a way (if anyone wants to implement
> it):
> 
> 1) add a timer to the c2_qp struct and start it when c2_llp_connect() is
> called.
> 
> 2) if the timer fires, generate a CONNECT_REPLY upcall to the IWCM with
> status TIMEDOUT.  Mark in the qp that the connect timed out. 
> 
> 3) deal with the rare condition that the timer fires at or about the
> same time the connection really does get established:  if the adapter
> passes up a CCAE_ACTIVE_CONNECT_RESULTS -after- the timer fires but
> before the qp is destroyed by the consumer, then you must squelch this
> event and probably destroy the HWQP at least from the adapter's
> perspective...

Here's a first cut.  It fixes one source of process hangs I had been
running into.  A couple of issues:

    - What is the proper connect timeout?  Old BSD used 24 sec.
    Modern linux seems to be around 3 minutes based on
    experimentation.

    - I used the new hrtimer code.  Backports will need major
    changes to use the old timer interface.

    - I also added code in c2_free_qp() to kill the connection (and
    release the qp ref) to handle the ctrl-C case, or any other
    event that would cause the QP to go away while a connection was
    outstanding.

    - No attempt is made to cleanup hardware state.  That code could
    go in connect_timer_expire(), although there may be issues on
    who is holding what locks when the timer expires.

This against r8688.

                -- Pete

Attempt to work around buggy Ammasso firmware that does not timeout
active connection requests.  Also actively cancels an outstanding
connections when the QP is being freed.

Signed-off-by: Pete Wyckoff <[EMAIL PROTECTED]>

Index: linux-kernel/infiniband/hw/amso1100/c2_qp.c
===================================================================
--- linux-kernel/infiniband/hw/amso1100/c2_qp.c (revision 8688)
+++ linux-kernel/infiniband/hw/amso1100/c2_qp.c (working copy)
@@ -517,6 +517,8 @@
        c2dev->qp_table.map[qp->qpn] = qp;
        spin_unlock_irq(&c2dev->qp_table.lock);
 
+       hrtimer_init(&qp->connect_timer, CLOCK_MONOTONIC, HRTIMER_REL);
+
        return 0;
 
       bail6:
@@ -545,6 +547,13 @@
        recv_cq = to_c2cq(qp->ibqp.recv_cq);
 
        /*
+        * If the timer was still active, a connection attempt is outstanding.
+        * Call the expire function directly to release the ref on the qp.
+        */
+       if (hrtimer_cancel(&qp->connect_timer))
+               qp->connect_timer.function(&qp->connect_timer);
+
+       /*
         * Lock CQs here, so that CQ polling code can do QP lookup
         * without taking a lock.
         */
Index: linux-kernel/infiniband/hw/amso1100/c2_ae.c
===================================================================
--- linux-kernel/infiniband/hw/amso1100/c2_ae.c (revision 8688)
+++ linux-kernel/infiniband/hw/amso1100/c2_ae.c (working copy)
@@ -226,6 +226,14 @@
                                cm_event.private_data_len = 0;
                                cm_event.private_data = NULL;
                        }
+
+                       /*
+                        * Cancel the connect timeout; but if it already
+                        * ran, throw away this hardware connect result
+                        * that raced against it.
+                        */
+                       if (hrtimer_cancel(&qp->connect_timer) == 0)
+                               goto ignore_it;
                        if (cm_event.private_data_len) {
                                /* copy private data */
                                pdata =
Index: linux-kernel/infiniband/hw/amso1100/c2_provider.h
===================================================================
--- linux-kernel/infiniband/hw/amso1100/c2_provider.h   (revision 8688)
+++ linux-kernel/infiniband/hw/amso1100/c2_provider.h   (working copy)
@@ -120,6 +120,7 @@
 
        struct c2_mq sq_mq;
        struct c2_mq rq_mq;
+       struct hrtimer connect_timer;
 };
 
 struct c2_cr_query_attrs {
Index: linux-kernel/infiniband/hw/amso1100/c2_cm.c
===================================================================
--- linux-kernel/infiniband/hw/amso1100/c2_cm.c (revision 8688)
+++ linux-kernel/infiniband/hw/amso1100/c2_cm.c (working copy)
@@ -36,6 +36,22 @@
 #include "c2_vq.h"
 #include <rdma/iw_cm.h>
 
+static int connect_timer_expire(struct hrtimer *timer)
+{
+       struct c2_qp *qp;
+
+       qp = container_of(timer, struct c2_qp, connect_timer);
+       if (qp->cm_id && qp->cm_id->event_handler) {
+               struct iw_cm_event cm_event = {
+                       .event = IW_CM_EVENT_CONNECT_REPLY,
+                       .status = IW_CM_EVENT_STATUS_TIMEOUT,
+               };
+               dprintk("%s: sending connect timeout event\n", __func__);
+               qp->cm_id->event_handler(qp->cm_id, &cm_event);
+       }
+       return HRTIMER_NORESTART;
+}
+
 int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
 {
        struct c2_dev *c2dev = to_c2dev(cm_id->device);
@@ -123,6 +139,15 @@
                cm_id->provider_data = NULL;
                qp->cm_id = NULL;
                cm_id->rem_ref(cm_id);
+       } else {
+               /*
+                * Start connect timer.  Since buggy firmware will not
+                * time out active connections, ever, this timer is used
+                * to force expiry after 30 sec.
+                */
+               qp->connect_timer.function = connect_timer_expire;
+               hrtimer_start(&qp->connect_timer, ktime_set(30, 0),
+                             HRTIMER_REL);
        }
        return err;
 }

_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to