In this instance, the code path getting us in trouble is:

 ib_modify_qp() calls 
   mthca_modify_qp() calls
     mthca_MODIFY_QP() calls
       mthca_cmd_box() calls
         mthca_cmd_wait() calls
           wait_for_completion() calls
             schedule()

but there are numerous code paths that reach mthca_cmd_wait(). We may 
have made this mistake in other places.

Roland, is an OpenIB user allowed to hold a spin lock over a verbs 
call? 

On Tue, 21 Jun 2005, Itamar Rabenstein wrote:

itamar> >Jun 20 09:13:50 localhost kernel: dapl_cm_active_cb_handler 138 event 
= 9
itamar> >Jun 20 09:13:50 localhost kernel: dapl_evd_connection_callback 760 
event=16389
itamar> >Jun 20 09:13:50 localhost kernel: dapl_evd_connection_callback 798 ep 
= c0a17bf8
itamar> >Jun 20 09:13:50 localhost kernel: dapl_ib_disconnect_clean 579 
ep=c0a17bf8
itamar> >Jun 20 09:13:50 localhost kernel: dapl_ib_disconnect 538 ep=c0a17bf8 
flags=0
itamar> >Jun 20 09:13:50 localhost kernel: dapl_destroy_cm_id 63
itamar> >Jun 20 09:13:50 localhost kernel: dapl_modify_qp_state_to_error 300
itamar> >Jun 20 09:13:50 localhost kernel: 
drivers/infiniband/ulp/dat-provider/dapl_ep.c:1111: 
spin_lock(drivers/infiniband/ulp/dat-provider/dapl_ep.c:c0a17c08) already 
locked by drivers/infiniband/ulp/dat-provider/dapl_evd.c/759
itamar> >Jun 20 09:13:51 localhost kernel: dapl_modify_qp_state_to_error 305
itamar> >Jun 20 09:13:51 localhost kernel: dapl_evd_connection_callback 800 ep 
= c0a17bf8
itamar> >Jun 20 09:13:51 localhost kernel: 
drivers/infiniband/ulp/dat-provider/dapl_evd.c:802: 
spin_unlock(drivers/infiniband/ulp/dat-provider/dapl_ep.c:c0a17c08) not locked
itamar> >Jun 20 09:13:51 localhost kernel: dapl_cr_callback 434 event=16389
itamar> >Jun 20 09:13:51 localhost kernel: dapl_cr_callback 512 ep = c9821bf8
itamar> >Jun 20 09:13:51 localhost kernel: dapl_ib_disconnect_clean 579 
ep=c982m: DREQ rcvd
itamar> 
itamar> Hi Hal,
itamar> I think i have found the problem. It is not legal to call ib_modify_qp()
itamar> inside spin_lock. This function can sleep.
itamar> 
itamar> Please try the patch below and if it works please tell james to ci it.
itamar> 
itamar> fix locking problem in cm callback functions
itamar> 
itamar> Signed-off-by: Itamar Rabenstein <[EMAIL PROTECTED]>
itamar> 
itamar> Index: dapl_openib_util.h
itamar> ===================================================================
itamar> --- dapl_openib_util.h  (revision 2665)
itamar> +++ dapl_openib_util.h  (working copy)
itamar> @@ -125,7 +125,7 @@
itamar>  
itamar>  void dapl_ib_reinit_ep(struct dapl_ep *ep);
itamar>  
itamar> -void dapl_ib_disconnect_clean(struct dapl_ep *ep, boolean_t passive);
itamar> +void dapl_ib_disconnect_clean(struct dapl_ep *ep);
itamar>  
itamar>  u32 dapl_ib_get_async_event(struct ib_event *cause,
itamar>                             enum dat_event_number *async_event);
itamar> Index: dapl_cr.c
itamar> ===================================================================
itamar> --- dapl_cr.c   (revision 2665)
itamar> +++ dapl_cr.c   (working copy)
itamar> @@ -479,8 +479,7 @@
itamar>                         /* If someone pulled the plug on the EP or 
connection,
itamar>                          * just exit
itamar>                          */
itamar> -                       spin_unlock_irqrestore(&ep->common.lock, 
itamar> -                                              ep->common.flags);
itamar> +                       spin_unlock_irqrestore(&ep->common.lock, 
ep->common.flags);
itamar>                         status = DAT_SUCCESS;
itamar>                         /* Set evd = NULL so we don't generate an event 
below */
itamar>                         evd = NULL;
itamar> @@ -504,36 +503,23 @@
itamar>                         /* The disconnect has already occurred, we are 
now
itamar>                          * cleaned up and ready to exit
itamar>                          */
itamar> -                       spin_unlock_irqrestore(&ep->common.lock, 
itamar> -                                              ep->common.flags);
itamar> +                       spin_unlock_irqrestore(&ep->common.lock, 
ep->common.flags);
itamar>                         return;
itamar>                 }
itamar>                 ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -               dapl_ib_disconnect_clean(ep, FALSE);
itamar>                 spin_unlock_irqrestore(&ep->common.lock, 
ep->common.flags);
itamar> +               dapl_ib_disconnect_clean(ep);
itamar>  
itamar>                 break;
itamar>         case DAT_CONNECTION_EVENT_NON_PEER_REJECTED:
itamar>         case DAT_CONNECTION_EVENT_PEER_REJECTED:
itamar>         case DAT_CONNECTION_EVENT_UNREACHABLE:
itamar> -               /*
itamar> -                * After posting an accept the requesting node has
itamar> -                * stopped talking.
itamar> -                */
itamar> +       case DAT_CONNECTION_EVENT_BROKEN:
itamar>                 spin_lock_irqsave(&ep->common.lock, ep->common.flags);
itamar>                 ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -               dapl_ib_disconnect_clean(ep, FALSE);
itamar>                 spin_unlock_irqrestore(&ep->common.lock, 
ep->common.flags);
itamar> -
itamar> +               dapl_ib_disconnect_clean(ep);
itamar>                 break;
itamar> -       case DAT_CONNECTION_EVENT_BROKEN:
itamar> -               spin_lock_irqsave(&ep->common.lock, ep->common.flags);
itamar> -               ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -               dapl_ib_disconnect_clean(ep, FALSE);
itamar> -               spin_unlock_irqrestore(&ep->common.lock,
itamar> -                                      ep->common.flags);
itamar> -
itamar> -               break;
itamar>         default:
itamar>                 evd = NULL;
itamar>                 dapl_os_assert(0);      /* shouldn't happen */
itamar> Index: dapl_evd.c
itamar> ===================================================================
itamar> --- dapl_evd.c  (revision 2665)
itamar> +++ dapl_evd.c  (working copy)
itamar> @@ -760,7 +760,6 @@
itamar>  
itamar>         switch (event) {
itamar>         case DAT_CONNECTION_EVENT_ESTABLISHED:
itamar> -       {
itamar>                 /* 
itamar>                  * If we don't have an EP at this point we are very 
screwed up 
itamar>                  */
itamar> @@ -784,65 +783,28 @@
itamar>                                       private_data_size);
itamar>                 }
itamar>                 spin_unlock_irqrestore(&ep->common.lock, 
ep->common.flags);
itamar> -
itamar>                 break;
itamar> -       }
itamar>         case DAT_CONNECTION_EVENT_DISCONNECTED:
itamar> -       {
itamar> -               /*
itamar> -                * EP is now fully disconnected; initiate any post 
processing
itamar> -                * to reset the underlying QP and get the EP ready for
itamar> -                * another connection
itamar> -                */
itamar>                 ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -               dapl_ib_disconnect_clean(ep, TRUE);
itamar> -               spin_unlock_irqrestore(&ep->common.lock,
itamar> -                                      ep->common.flags);
itamar> -
itamar> +               
spin_unlock_irqrestore(&ep->common.lock,ep->common.flags);
itamar> +               dapl_ib_disconnect_clean(ep);
itamar>                 break;
itamar> -       }
itamar>         case DAT_CONNECTION_EVENT_PEER_REJECTED:
itamar> -       {
itamar> -               ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -               dapl_ib_disconnect_clean(ep, TRUE);
itamar> -               spin_unlock_irqrestore(&ep->common.lock, 
ep->common.flags);
itamar> -
itamar> -               break;
itamar> -       }
itamar>         case DAT_CONNECTION_EVENT_UNREACHABLE:
itamar> -       {
itamar> -               ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -               dapl_ib_disconnect_clean(ep, TRUE);
itamar> -               spin_unlock_irqrestore(&ep->common.lock, 
ep->common.flags);
itamar> -
itamar> -               break;
itamar> -       }
itamar>         case DAT_CONNECTION_EVENT_NON_PEER_REJECTED:
itamar> -       {
itamar> -               ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -               dapl_ib_disconnect_clean(ep, TRUE);
itamar> -               spin_unlock_irqrestore(&ep->common.lock, 
ep->common.flags);
itamar> -
itamar> -               break;
itamar> -       }
itamar>         case DAT_CONNECTION_EVENT_BROKEN:
itamar> -       {
itamar>                 ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -               dapl_ib_disconnect_clean(ep, FALSE);
itamar>                 spin_unlock_irqrestore(&ep->common.lock, 
ep->common.flags);
itamar> -
itamar> +               dapl_ib_disconnect_clean(ep);
itamar>                 break;
itamar> -       }
itamar>         case DAT_CONNECTION_REQUEST_EVENT:
itamar>         default:
itamar> -       {
itamar>                 spin_unlock_irqrestore(&ep->common.lock, 
ep->common.flags);
itamar>                 evd = NULL;
itamar>  
itamar>                 dapl_os_assert(0);      /* shouldn't happen */
itamar>                 break;
itamar>         }
itamar> -       }
itamar>  
itamar>         /*
itamar>          * Post the event
itamar> Index: dapl_openib_cm.c
itamar> ===================================================================
itamar> --- dapl_openib_cm.c    (revision 2665)
itamar> +++ dapl_openib_cm.c    (working copy)
itamar> @@ -562,13 +562,12 @@
itamar>   *      void
itamar>   *
itamar>   */
itamar> -void dapl_ib_disconnect_clean(struct dapl_ep *ep_ptr, boolean_t active)
itamar> +void dapl_ib_disconnect_clean(struct dapl_ep *ep_ptr)
itamar>  {
itamar>         int status;
itamar>  
itamar>         dapl_dbg_log(DAPL_DBG_TYPE_CM,
itamar> -                    "  >>> dapl_ib_disconnect_clean: EP: %p active 
%d\n",
itamar> -                    ep_ptr, active);
itamar> +                    "  >>> dapl_ib_disconnect_clean: EP: %p \n", 
ep_ptr);
itamar>  
itamar>         /*
itamar>          * Clean up outstanding connection state
itamar> -- 
itamar> Itamar
itamar> 
_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to