Quoting r. Michael S. Tsirkin <[EMAIL PROTECTED]>:
> Subject: ipoib oops
> 
> Hi!
> I saw this in /var/log/messages recently.
> Unfortunately I cant say exactly what I did to trigger this problem.

The following is for review only: this triggers another problem in
ipoib which I'm working on now.
Comments?

---

Make sure all users of priv->mcast_list and priv_broadcast are protected
by priv->lock. I had to add another list_head to the mcast structure
to avoid using mcast_list outside the lock in stop_thread.

Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]>

Index: linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- linux-2.6.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c    
2005-11-15 15:53:09.000000000 +0200
+++ linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2005-11-15 
17:31:35.000000000 +0200
@@ -62,6 +62,7 @@ struct ipoib_mcast {
 
        struct rb_node    rb_node;
        struct list_head  list;
+       struct list_head  cancel_list;
        struct completion done;
 
        int                 query_id;
@@ -587,10 +588,12 @@ int ipoib_mcast_start_thread(struct net_
        return 0;
 }
 
-int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
+int ipoib_mcast_stop_thread(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
-       struct ipoib_mcast *mcast;
+       struct ipoib_mcast *mcast, *tmcast;
+       unsigned long flags;
+       LIST_HEAD(cancel_list);
 
        ipoib_dbg_mcast(priv, "stopping multicast thread\n");
 
@@ -599,17 +602,20 @@ int ipoib_mcast_stop_thread(struct net_d
        cancel_delayed_work(&priv->mcast_task);
        up(&mcast_mutex);
 
-       if (flush)
-               flush_workqueue(ipoib_workqueue);
+       flush_workqueue(ipoib_workqueue);
 
-       if (priv->broadcast && priv->broadcast->query) {
-               ib_sa_cancel_query(priv->broadcast->query_id, 
priv->broadcast->query);
-               priv->broadcast->query = NULL;
-               ipoib_dbg_mcast(priv, "waiting for bcast\n");
-               wait_for_completion(&priv->broadcast->done);
-       }
+       spin_lock_irqsave(&priv->lock, flags);
+
+       if (priv->broadcast && priv->broadcast->query)
+               list_add_tail(&priv->broadcast->cancel_list, &cancel_list);
+
+       list_for_each_entry(mcast, &priv->multicast_list, list)
+               if (mcast->query)
+                       list_add_tail(&mcast->cancel_list, &cancel_list);
 
-       list_for_each_entry(mcast, &priv->multicast_list, list) {
+       spin_unlock_irqrestore(&priv->lock, flags);
+
+       list_for_each_entry_safe(mcast, tmcast, &cancel_list, cancel_list) {
                if (mcast->query) {
                        ib_sa_cancel_query(mcast->query_id, mcast->query);
                        mcast->query = NULL;
@@ -617,6 +623,7 @@ int ipoib_mcast_stop_thread(struct net_d
                                        IPOIB_GID_ARG(mcast->mcmember.mgid));
                        wait_for_completion(&mcast->done);
                }
+               list_del_init(&mcast->cancel_list);
        }
 
        return 0;
@@ -741,12 +748,14 @@ void ipoib_mcast_dev_flush(struct net_de
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        LIST_HEAD(remove_list);
+       LIST_HEAD(cancel_list);
        struct ipoib_mcast *mcast, *tmcast, *nmcast;
        unsigned long flags;
 
        ipoib_dbg_mcast(priv, "flushing multicast list\n");
 
        spin_lock_irqsave(&priv->lock, flags);
+
        list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
                nmcast = ipoib_mcast_alloc(dev, 0);
                if (nmcast) {
@@ -780,14 +789,24 @@ void ipoib_mcast_dev_flush(struct net_de
                                        &priv->multicast_tree);
 
                        list_add_tail(&priv->broadcast->list, &remove_list);
+                       priv->broadcast = nmcast;
+               } else {
+                       ipoib_warn(priv, "could not reallocate multicast group "
+                                  IPOIB_GID_FMT "\n",
+                                  
IPOIB_GID_ARG(priv->broadcast->mcmember.mgid));
                }
-
-               priv->broadcast = nmcast;
        }
 
        spin_unlock_irqrestore(&priv->lock, flags);
 
        list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+               if (mcast->query) {
+                       ib_sa_cancel_query(mcast->query_id, mcast->query);
+                       mcast->query = NULL;
+                       ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT 
"\n",
+                                       IPOIB_GID_ARG(mcast->mcmember.mgid));
+                       wait_for_completion(&mcast->done);
+               }
                ipoib_mcast_leave(dev, mcast);
                ipoib_mcast_free(mcast);
        }
@@ -821,8 +840,12 @@ void ipoib_mcast_restart_task(void *dev_
        unsigned long flags;
 
        ipoib_dbg_mcast(priv, "restarting multicast task\n");
+       ipoib_dbg_mcast(priv, "stopping multicast thread\n");
 
-       ipoib_mcast_stop_thread(dev, 0);
+       down(&mcast_mutex);
+       clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+       cancel_delayed_work(&priv->mcast_task);
+       up(&mcast_mutex);
 
        spin_lock_irqsave(&priv->lock, flags);
 
@@ -900,6 +923,13 @@ void ipoib_mcast_restart_task(void *dev_
 
        /* We have to cancel outside of the spinlock */
        list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+               if (mcast->query) {
+                       ib_sa_cancel_query(mcast->query_id, mcast->query);
+                       mcast->query = NULL;
+                       ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT 
"\n",
+                                       IPOIB_GID_ARG(mcast->mcmember.mgid));
+                       wait_for_completion(&mcast->done);
+               }
                ipoib_mcast_leave(mcast->dev, mcast);
                ipoib_mcast_free(mcast);
        }
Index: linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- linux-2.6.14.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c   2005-11-15 
15:53:09.000000000 +0200
+++ linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_ib.c        2005-11-15 
17:20:47.000000000 +0200
@@ -452,7 +452,7 @@ int ipoib_ib_dev_down(struct net_device 
                flush_workqueue(ipoib_workqueue);
        }
 
-       ipoib_mcast_stop_thread(dev, 1);
+       ipoib_mcast_stop_thread(dev);
 
        /*
         * Flush the multicast groups first so we stop any multicast joins. The
@@ -619,7 +619,7 @@ void ipoib_ib_dev_cleanup(struct net_dev
 
        ipoib_dbg(priv, "cleaning up ib_dev\n");
 
-       ipoib_mcast_stop_thread(dev, 1);
+       ipoib_mcast_stop_thread(dev);
 
        /* Delete the broadcast address and the local address */
        ipoib_mcast_dev_down(dev);
Index: linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- linux-2.6.14.orig/drivers/infiniband/ulp/ipoib/ipoib.h      2005-11-14 
17:22:18.000000000 +0200
+++ linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib.h   2005-11-15 
17:02:04.000000000 +0200
@@ -264,7 +264,7 @@ void ipoib_mcast_send(struct net_device 
 
 void ipoib_mcast_restart_task(void *dev_ptr);
 int ipoib_mcast_start_thread(struct net_device *dev);
-int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
+int ipoib_mcast_stop_thread(struct net_device *dev);
 
 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);

-- 
MST
_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to