> resources (PZ, etc.). We must have been getting lucky on IB.
> 
> I'm worried that a callback could occur ...
> 
>   void dapli_destroy_conn(struct dapl_cm_id *conn)
>   {
>           int in_callback;
>           struct rdma_cm_id *cm_id;
>   
>           dapl_dbg_log(DAPL_DBG_TYPE_CM,
>                        " destroy_conn: conn %p id %d\n",
>                        conn,conn->cm_id);
> 
>           dapl_os_lock(&conn->lock);
>           conn->destroy = 1;
>           in_callback = conn->in_callback;
>           do {
>                   if (in_callback) {
>                           dapl_os_unlock(&conn->lock);
>                           usleep(10);
>                           dapl_os_lock(&conn->lock);
>                   }
>                   in_callback = conn->in_callback;
>           } while (in_callback);
>   
>           if (conn->ep)
>                   conn->ep->cm_handle = IB_INVALID_HANDLE;
>           cm_id = conn->cm_id;
>           conn->cm_id = NULL;
>           dapl_os_unlock(&conn->lock);
> 
> /* ... here */
> 
>           if (cm_id) {
>                   if (cm_id->qp)
>                           rdma_destroy_qp(cm_id);
>                   rdma_destroy_id(cm_id);
>           }
>           dapl_os_free(conn, sizeof(*conn));
>   }
> 
> Destroying the cm_id while in a callback would be bad.
> 
> >  static struct dapl_cm_id * dapli_req_recv(struct dapl_cm_id *conn,
> > @@ -243,11 +249,9 @@
> >     return new_conn;
> >  }
> >  
> > -static int dapli_cm_active_cb(struct dapl_cm_id *conn,
> > +static void dapli_cm_active_cb(struct dapl_cm_id *conn,
> >                           struct rdma_cm_event *event)
> >  {
> > -   int destroy;
> > -
> >     dapl_dbg_log(DAPL_DBG_TYPE_CM, 
> >                  " active_cb: conn %p id %d event %d\n",
> >                  conn, conn->cm_id, event->event );
> > @@ -255,7 +259,7 @@
> >     dapl_os_lock(&conn->lock);
> >     if (conn->destroy) {
> >             dapl_os_unlock(&conn->lock);
> > -           return 0;
> > +           return;
> >     }
> >     conn->in_callback = 1;
> >     dapl_os_unlock(&conn->lock);
> > @@ -303,16 +307,14 @@
> >     }
> >  
> >     dapl_os_lock(&conn->lock);
> > -   destroy = conn->destroy;
> > -   conn->in_callback = conn->destroy;
> > +   conn->in_callback = 0;
> >     dapl_os_unlock(&conn->lock);
> > -   return(destroy);
> > +   return;
> >  }
> >  
> > -static int dapli_cm_passive_cb(struct dapl_cm_id *conn,
> > +static void dapli_cm_passive_cb(struct dapl_cm_id *conn,
> >                            struct rdma_cm_event *event)
> >  {
> > -   int destroy;
> >     struct dapl_cm_id *new_conn;
> >  
> >     dapl_dbg_log(DAPL_DBG_TYPE_CM, 
> > @@ -322,7 +324,7 @@
> >     dapl_os_lock(&conn->lock);
> >     if (conn->destroy) {
> >             dapl_os_unlock(&conn->lock);
> > -           return 0;
> > +           return;
> >     }
> >     conn->in_callback = 1;
> >     dapl_os_unlock(&conn->lock);
> > @@ -377,10 +379,9 @@
> >     }
> >  
> >     dapl_os_lock(&conn->lock);
> > -   destroy = conn->destroy;
> > -   conn->in_callback = conn->destroy;
> > +   conn->in_callback = 0;
> >     dapl_os_unlock(&conn->lock);
> > -   return(destroy);
> > +   return;
> >  }
> >  
> > 
> > @@ -1021,7 +1022,6 @@
> >     /* process one CM event, fairness */
> >     if(!ret) {
> >             struct dapl_cm_id *conn;
> > -           int ret;
> >                             
> >             /* set proper conn from cm_id context*/
> >             if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
> > @@ -1059,24 +1059,9 @@
> >             case RDMA_CM_EVENT_DISCONNECTED:
> >                     /* passive or active */
> >                     if (conn->sp) 
> > -                           ret = dapli_cm_passive_cb(conn,event);
> > +                           dapli_cm_passive_cb(conn,event);
> >                     else 
> > -                           ret = dapli_cm_active_cb(conn,event);
> > -
> > -                   /* destroy both qp and cm_id */
> > -                   if (ret) {
> > -                           dapl_dbg_log(DAPL_DBG_TYPE_CM, 
> > -                                        " cma_cb: DESTROY conn %p" 
> > -                                        " cm_id %p qp %p\n",
> > -                                        conn, conn->cm_id, 
> > -                                        conn->cm_id->qp);
> > -   
> > -                           if (conn->cm_id->qp)
> > -                                   rdma_destroy_qp(conn->cm_id);
> > -
> > -                           rdma_destroy_id(conn->cm_id);
> > -                           dapl_os_free(conn, sizeof(*conn));
> > -                   }
> > +                           dapli_cm_active_cb(conn,event);
> 
> If this code is removed, we'll need to update functions that set 
> conn->destroy to 1 to destroy the cm_id.
> 

I thought only dapli_destroy_conn() sets this, but now I see that
dapli_thread() can also set this so I'll go check this out.

> What happens if a consumer attempts to free the EP from a callback? 

There are no direct consumer callbacks in usermode are there?  consumers
call dat_evd_wait() or whatever and get scheduled.  Not like kernel
mode...  Or am I confused?

> With this change (or any one that blocked a callback thread from 
> attempting to free the EP), I believe we would deadlock. 
> 
> Is it possible to destroy CM IDs from within a callback? 

I'll go verify this again, but I don't think the callback dapl thread
does this.  

Stevo.


_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to