This patch was sent to linux-rdma a while ago but has not been accepted yet. However, no objections have been raised so far.
Note: the patch below is not to driver/infiniband/ulp/ipoib but it generates a patch under kernel_patches/fixes. -- Index: ofa_kernel-1.5.3/kernel_patches/fixes/zzz_0041_add_mcast_gc.diff =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ ofa_kernel-1.5.3/kernel_patches/fixes/zzz_0041_add_mcast_gc.diff 2011-02-02 16:29:00.000000000 +0200 @@ -0,0 +1,206 @@ +The kernel never leaves send only multicast groups. In addition, IPoIB doesn't +implement real send only join but it sends the SM a send/receive join request. +In order to avoid MC group explosion on the switch, a mechanism of garbage +collection to unused multicast groups is required. + +Signed-off-by: Yossi Etigin <[email protected]> +Signed-off-by: Moni Shoua <[email protected]> +-- + +diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h +index ab97f92..fb1714f 100644 +--- a/drivers/infiniband/ulp/ipoib/ipoib.h ++++ b/drivers/infiniband/ulp/ipoib/ipoib.h +@@ -92,6 +92,7 @@ enum { + IPOIB_FLAG_ADMIN_CM = 9, + IPOIB_FLAG_UMCAST = 10, + IPOIB_FLAG_CSUM = 11, ++ IPOIB_MCAST_RUN_GC = 12, + + IPOIB_MAX_BACKOFF_SECONDS = 16, + +@@ -132,6 +133,7 @@ struct ipoib_mcast { + struct list_head list; + + unsigned long created; ++ unsigned long used; + unsigned long backoff; + + unsigned long flags; +@@ -283,7 +285,8 @@ struct ipoib_dev_priv { + struct rb_root multicast_tree; + + struct delayed_work pkey_poll_task; +- struct delayed_work mcast_task; ++ struct delayed_work mcast_join_task; ++ struct delayed_work mcast_leave_task; + struct work_struct carrier_on_task; + struct work_struct flush_light; + struct work_struct flush_normal; +@@ -411,6 +414,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh); + + extern struct workqueue_struct *ipoib_workqueue; + ++extern int ipoib_mc_sendonly_timeout; ++ + /* functions */ + + int ipoib_poll(struct napi_struct *napi, int budget); +@@ -453,6 +458,7 @@ 
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); + void ipoib_dev_cleanup(struct net_device *dev); + + void ipoib_mcast_join_task(struct work_struct *work); ++void ipoib_mcast_leave_task(struct work_struct *work); + void ipoib_mcast_carrier_on_task(struct work_struct *work); + void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); + +diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c +index 7a07a72..563370e 100644 +--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c +@@ -67,6 +67,11 @@ module_param_named(debug_level, ipoib_debug_level, int, 0644); + MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); + #endif + ++int ipoib_mc_sendonly_timeout; ++ ++module_param_named(mc_sendonly_timeout, ipoib_mc_sendonly_timeout, int, 0644); ++MODULE_PARM_DESC(mc_sendonly_timeout, "Enable debug tracing if > 0"); ++ + struct ipoib_path_iter { + struct net_device *dev; + struct ipoib_path path; +@@ -1020,7 +1025,8 @@ static void ipoib_setup(struct net_device *dev) + INIT_LIST_HEAD(&priv->multicast_list); + + INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); +- INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); ++ INIT_DELAYED_WORK(&priv->mcast_join_task, ipoib_mcast_join_task); ++ INIT_DELAYED_WORK(&priv->mcast_leave_task, ipoib_mcast_leave_task); + INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); + INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); + INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); +diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +index 3871ac6..87928c1 100644 +--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +@@ -117,6 +117,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, + + mcast->dev = dev; + mcast->created = jiffies; 
++ mcast->used = jiffies; + mcast->backoff = 1; + + INIT_LIST_HEAD(&mcast->list); +@@ -403,7 +404,7 @@ static int ipoib_mcast_join_complete(int status, + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, +- &priv->mcast_task, 0); ++ &priv->mcast_join_task, 0); + mutex_unlock(&mcast_mutex); + + /* +@@ -436,7 +437,7 @@ static int ipoib_mcast_join_complete(int status, + mutex_lock(&mcast_mutex); + spin_lock_irq(&priv->lock); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) +- queue_delayed_work(ipoib_workqueue, &priv->mcast_task, ++ queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, + mcast->backoff * HZ); + spin_unlock_irq(&priv->lock); + mutex_unlock(&mcast_mutex); +@@ -505,7 +506,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, +- &priv->mcast_task, ++ &priv->mcast_join_task, + mcast->backoff * HZ); + mutex_unlock(&mcast_mutex); + } +@@ -514,7 +515,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, + void ipoib_mcast_join_task(struct work_struct *work) + { + struct ipoib_dev_priv *priv = +- container_of(work, struct ipoib_dev_priv, mcast_task.work); ++ container_of(work, struct ipoib_dev_priv, mcast_join_task.work); + struct net_device *dev = priv->dev; + + if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) +@@ -546,7 +547,7 @@ void ipoib_mcast_join_task(struct work_struct *work) + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, +- &priv->mcast_task, HZ); ++ &priv->mcast_join_task, HZ); + mutex_unlock(&mcast_mutex); + return; + } +@@ -610,7 +611,9 @@ int ipoib_mcast_start_thread(struct net_device *dev) + + mutex_lock(&mcast_mutex); + if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) +- queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); ++ 
queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, 0); ++ if (!test_and_set_bit(IPOIB_MCAST_RUN_GC, &priv->flags)) ++ queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 0); + mutex_unlock(&mcast_mutex); + + return 0; +@@ -624,7 +627,9 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush) + + mutex_lock(&mcast_mutex); + clear_bit(IPOIB_MCAST_RUN, &priv->flags); +- cancel_delayed_work(&priv->mcast_task); ++ clear_bit(IPOIB_MCAST_RUN_GC, &priv->flags); ++ cancel_delayed_work(&priv->mcast_join_task); ++ cancel_delayed_work(&priv->mcast_leave_task); + mutex_unlock(&mcast_mutex); + + if (flush) +@@ -727,7 +732,7 @@ out: + list_add_tail(&neigh->list, &mcast->neigh_list); + } + } +- ++ mcast->used = jiffies; + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); + return; +@@ -888,6 +893,35 @@ void ipoib_mcast_restart_task(struct work_struct *work) + ipoib_mcast_start_thread(dev); + } + ++void ipoib_mcast_leave_task(struct work_struct *work) ++{ ++ struct ipoib_dev_priv *priv = ++ container_of(work, struct ipoib_dev_priv, mcast_leave_task.work); ++ struct net_device *dev = priv->dev; ++ struct ipoib_mcast *mcast, *tmcast; ++ LIST_HEAD(remove_list); ++ ++ if (!test_bit(IPOIB_MCAST_RUN_GC, &priv->flags)) ++ return; ++ ++ if (ipoib_mc_sendonly_timeout > 0) { ++ list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { ++ if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && ++ time_before(mcast->used, jiffies - ipoib_mc_sendonly_timeout * HZ)) { ++ rb_erase(&mcast->rb_node, &priv->multicast_tree); ++ list_move_tail(&mcast->list, &remove_list); ++ } ++ } ++ ++ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ++ ipoib_mcast_leave(dev, mcast); ++ ipoib_mcast_free(mcast); ++ } ++ } ++ ++ queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 60 * HZ); ++} ++ + #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + + struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) 
_______________________________________________ ewg mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
