Comments below:

On Tue, 4 Apr 2006, Steve Wise wrote:

> Index: openib_cma/dapl_ib_cm.c
> ===================================================================
> --- openib_cma/dapl_ib_cm.c   (revision 6107)
> +++ openib_cma/dapl_ib_cm.c   (working copy)
> @@ -62,9 +62,9 @@
>  /* local prototypes */
>  static struct dapl_cm_id * dapli_req_recv(struct dapl_cm_id *conn, 
>                                         struct rdma_cm_event *event);
> -static int dapli_cm_active_cb(struct dapl_cm_id *conn, 
> +static void dapli_cm_active_cb(struct dapl_cm_id *conn, 
>                             struct rdma_cm_event *event);
> -static int dapli_cm_passive_cb(struct dapl_cm_id *conn, 
> +static void dapli_cm_passive_cb(struct dapl_cm_id *conn, 
>                              struct rdma_cm_event *event);
>  static void dapli_addr_resolve(struct dapl_cm_id *conn);
>  static void dapli_route_resolve(struct dapl_cm_id *conn);
> @@ -164,28 +164,34 @@
>  void dapli_destroy_conn(struct dapl_cm_id *conn)
>  {
>       int in_callback;
> +     struct rdma_cm_id *cm_id;
>  
>       dapl_dbg_log(DAPL_DBG_TYPE_CM, 
>                    " destroy_conn: conn %p id %d\n",
>                    conn,conn->cm_id);
> -
>       dapl_os_lock(&conn->lock);
>       conn->destroy = 1;
>       in_callback = conn->in_callback;
> -     dapl_os_unlock(&conn->lock);
> -
> -     if (!in_callback) {
> -             if (conn->ep)
> -                     conn->ep->cm_handle = IB_INVALID_HANDLE;
> -             if (conn->cm_id) {
> -                     if (conn->cm_id->qp)
> -                             rdma_destroy_qp(conn->cm_id);
> -                     rdma_destroy_id(conn->cm_id);
> +     do {
> +             if (in_callback) {
> +                     dapl_os_unlock(&conn->lock);
> +                     usleep(10);
> +                     dapl_os_lock(&conn->lock);
>               }
> -                     
> -             conn->cm_id = NULL;
> -             dapl_os_free(conn, sizeof(*conn));
> +             in_callback = conn->in_callback;
> +     } while (in_callback);
> +
> +     if (conn->ep)
> +             conn->ep->cm_handle = IB_INVALID_HANDLE;
> +     cm_id = conn->cm_id;
> +     conn->cm_id = NULL;
> +     dapl_os_unlock(&conn->lock);
> +     if (cm_id) {
> +             if (cm_id->qp)
> +                     rdma_destroy_qp(cm_id);
> +             rdma_destroy_id(cm_id);
>       }
> +     dapl_os_free(conn, sizeof(*conn));
>  }

This is an improvement. If dat_ep_free() returns success, DAPL 
consumers expect to be able to free the free'd EP's 
resources (PZ, etc.). We must have been getting lucky on IB.

I'm worried that a callback could occur ...

  void dapli_destroy_conn(struct dapl_cm_id *conn)
  {
          int in_callback;
          struct rdma_cm_id *cm_id;
  
          dapl_dbg_log(DAPL_DBG_TYPE_CM,
                       " destroy_conn: conn %p id %d\n",
                       conn,conn->cm_id);

          dapl_os_lock(&conn->lock);
          conn->destroy = 1;
          in_callback = conn->in_callback;
          do {
                  if (in_callback) {
                          dapl_os_unlock(&conn->lock);
                          usleep(10);
                          dapl_os_lock(&conn->lock);
                  }
                  in_callback = conn->in_callback;
          } while (in_callback);
  
          if (conn->ep)
                  conn->ep->cm_handle = IB_INVALID_HANDLE;
          cm_id = conn->cm_id;
          conn->cm_id = NULL;
          dapl_os_unlock(&conn->lock);

/* ... here */

          if (cm_id) {
                  if (cm_id->qp)
                          rdma_destroy_qp(cm_id);
                  rdma_destroy_id(cm_id);
          }
          dapl_os_free(conn, sizeof(*conn));
  }

Destroying the cm_id while in a callback would be bad.

>  static struct dapl_cm_id * dapli_req_recv(struct dapl_cm_id *conn,
> @@ -243,11 +249,9 @@
>       return new_conn;
>  }
>  
> -static int dapli_cm_active_cb(struct dapl_cm_id *conn,
> +static void dapli_cm_active_cb(struct dapl_cm_id *conn,
>                             struct rdma_cm_event *event)
>  {
> -     int destroy;
> -
>       dapl_dbg_log(DAPL_DBG_TYPE_CM, 
>                    " active_cb: conn %p id %d event %d\n",
>                    conn, conn->cm_id, event->event );
> @@ -255,7 +259,7 @@
>       dapl_os_lock(&conn->lock);
>       if (conn->destroy) {
>               dapl_os_unlock(&conn->lock);
> -             return 0;
> +             return;
>       }
>       conn->in_callback = 1;
>       dapl_os_unlock(&conn->lock);
> @@ -303,16 +307,14 @@
>       }
>  
>       dapl_os_lock(&conn->lock);
> -     destroy = conn->destroy;
> -     conn->in_callback = conn->destroy;
> +     conn->in_callback = 0;
>       dapl_os_unlock(&conn->lock);
> -     return(destroy);
> +     return;
>  }
>  
> -static int dapli_cm_passive_cb(struct dapl_cm_id *conn,
> +static void dapli_cm_passive_cb(struct dapl_cm_id *conn,
>                              struct rdma_cm_event *event)
>  {
> -     int destroy;
>       struct dapl_cm_id *new_conn;
>  
>       dapl_dbg_log(DAPL_DBG_TYPE_CM, 
> @@ -322,7 +324,7 @@
>       dapl_os_lock(&conn->lock);
>       if (conn->destroy) {
>               dapl_os_unlock(&conn->lock);
> -             return 0;
> +             return;
>       }
>       conn->in_callback = 1;
>       dapl_os_unlock(&conn->lock);
> @@ -377,10 +379,9 @@
>       }
>  
>       dapl_os_lock(&conn->lock);
> -     destroy = conn->destroy;
> -     conn->in_callback = conn->destroy;
> +     conn->in_callback = 0;
>       dapl_os_unlock(&conn->lock);
> -     return(destroy);
> +     return;
>  }
>  
> 
> @@ -1021,7 +1022,6 @@
>       /* process one CM event, fairness */
>       if(!ret) {
>               struct dapl_cm_id *conn;
> -             int ret;
>                               
>               /* set proper conn from cm_id context*/
>               if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
> @@ -1059,24 +1059,9 @@
>               case RDMA_CM_EVENT_DISCONNECTED:
>                       /* passive or active */
>                       if (conn->sp) 
> -                             ret = dapli_cm_passive_cb(conn,event);
> +                             dapli_cm_passive_cb(conn,event);
>                       else 
> -                             ret = dapli_cm_active_cb(conn,event);
> -
> -                     /* destroy both qp and cm_id */
> -                     if (ret) {
> -                             dapl_dbg_log(DAPL_DBG_TYPE_CM, 
> -                                          " cma_cb: DESTROY conn %p" 
> -                                          " cm_id %p qp %p\n",
> -                                          conn, conn->cm_id, 
> -                                          conn->cm_id->qp);
> -     
> -                             if (conn->cm_id->qp)
> -                                     rdma_destroy_qp(conn->cm_id);
> -
> -                             rdma_destroy_id(conn->cm_id);
> -                             dapl_os_free(conn, sizeof(*conn));
> -                     }
> +                             dapli_cm_active_cb(conn,event);

If this code is removed, we'll need to update functions that set 
conn->destroy to 1 to destroy the cm_id.

What happens if a consumer attempts to free the EP from a callback? 
With this change (or any one that blocked a callback thread from 
attempting to free the EP), I believe we would deadlock. 

Is it possible to destroy CM IDs from within a callback? 
_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to