On Fri, 2005-05-06 at 12:52, Hal Rosenstock wrote: 
> On Wed, 2005-05-04 at 02:57, Michael S. Tsirkin wrote: 
> > > If this only occurs on machines which are running OpenSM, it could be a
> > > problem with the local MAD handling where ib_free_recv_mad is not called
> > > for some case where it should. I will look into this if this is the
> > > case.
> > > 
> > > -- Hal
> > > 
> > 
> > This is the case for me: the error only occurs on machines which are
> > running OpenSM.
> 
> So assuming mthca sets the bits properly for local MAD handling (and I
> have no reason to think otherwise), this is likely a local MAD handling
> error in mad.c where some path does not return an allocation to the MAD
> cache. I inspected the code to try and find it but couldn't. I will need
> to dig deeper. This will take some more time. (I will add some
> accounting and see what it turns up.) Please bear with me.

This patch works for me. Can you try it out? If it works for you, I
will check it in.

-- Hal 
Index: mad.c
===================================================================
--- mad.c        (revision 2331)
+++ mad.c       (working copy)
@@ -1602,7 +1602,6 @@
 
        INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
        list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
-       
        if (mad_agent_priv->agent.rmpp_version) {
                mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
                                                      mad_recv_wc);
@@ -2054,6 +2053,8 @@
 
        /* Empty wait list to prevent receives from finding a request */
        list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+       /* Empty local completion list as well */
+       list_splice_init(&mad_agent_priv->local_list, &cancel_list);
        spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
        /* Report all cancelled requests */
@@ -2168,6 +2169,7 @@
        struct ib_mad_local_private *local;
        struct ib_mad_agent_private *recv_mad_agent;
        unsigned long flags;
+       int recv = 0;
        struct ib_wc wc;
        struct ib_mad_send_wc mad_send_wc;
 
@@ -2183,10 +2185,10 @@
                        recv_mad_agent = local->recv_mad_agent;
                        if (!recv_mad_agent) {
                                printk(KERN_ERR PFX "No receive MAD agent for 
local completion\n");
-                               kmem_cache_free(ib_mad_cache, local->mad_priv);
                                goto local_send_completion;
                        }
 
+                       recv = 1;
                        /*
                         * Defined behavior is to complete response
                         * before request
@@ -2199,6 +2201,8 @@
                        local->mad_priv->header.recv_wc.mad_len =
                                                sizeof(struct ib_mad);
                        
INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
+                       list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
+                                &local->mad_priv->header.recv_wc.rmpp_list);
                        local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
                        local->mad_priv->header.recv_wc.recv_buf.mad =
                                                &local->mad_priv->mad.mad;
@@ -2229,6 +2233,8 @@
                spin_lock_irqsave(&mad_agent_priv->lock, flags);
                list_del(&local->completion_list);
                atomic_dec(&mad_agent_priv->refcount);
+               if (!recv)
+                       kmem_cache_free(ib_mad_cache, local->mad_priv);
                kfree(local);
        }
        spin_unlock_irqrestore(&mad_agent_priv->lock, flags);



_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to