> When using the bonding driver, neighbours are created by the net stack on 
> behalf
> of the bonding (master) device. On the tx flow the bonding code gets an skb 
> such
> that skb->dev points to the master device, it changes this skb to point on the
> slave device and calls the slave hard_start_xmit function.
> 
> 
> Combing these two flows, there is a hole if some code at ipoib
> (ipoib_neigh_destructor) assumes that for each struct neighbour it gets, 
> n->dev
> is an ipoib device so for example netdev_priv(n->dev) would be of type struct
> ipoib_dev_priv.
> 
> To fix it, this patch adds a dev field to struct ipoib_neigh which is used
> instead of the struct neighbour dev one.

It seems that in this design, if multiple ipoib interfaces are present, we might
get an skb such that skb->dev will be different from the new dev field in struct
ipoib_neigh.

It seems that the result will be that the packet will be sent on a wrong 
interface.
Right?

> In addition, if an IPoIB device is removed before bonding is unloaded it may 
> cause bond0 neighbours (neighbours that point to bond0) to exist after the 
> IPoIB
> device no longer exist. This is why a neighbour cleanup is required during 
> device 
> cleanup. This cleanup scans the arp cache and the ndisc cache to find there 
> neighbours of bond0 which refer also to the relevant ibX. Also, when ib_ipoib 
> module is
> unloaded, the neighbour destructor must be set to NULL because the neighbour 
> function is in
> ib_ipoib.
> For this neigh table cleanup, it is required to export the symbol nd_tbl just 
> like the symbol arp_tbl is.

I wonder about this: is it really true that any allocated neighbour is always in
either arp_tbl or nd_tbl? For example, could some code have called neigh_hold
and retained a neighbour that is not in either one of these tables?

> During my tests I found that when running 
> 
>       1. modprobe -r ib_mthca (to delete IPoIB interfaces)
>       2. ping somewhere on the subnet of bond0
> 
> I get this stack dump (which ends with kernel death)
>        [<ffffffff8037ff32>] skb_under_panic+0x5c/0x60
>        [<ffffffff882e00c2>] :ib_ipoib:ipoib_hard_header+0xa6/0xc0
>        [<ffffffff803c3c98>] arp_create+0x120/0x226
>        [<ffffffff803c3dc3>] arp_send+0x25/0x3b
>        [<ffffffff803c466a>] arp_solicit+0x186/0x195
>        [<ffffffff8038c0ac>] neigh_timer_handler+0x2b5/0x309
>        [<ffffffff8038bdf7>] neigh_timer_handler+0x0/0x309
>        [<ffffffff80239599>] run_timer_softirq+0x130/0x19e
>        [<ffffffff80235fcc>] __do_softirq+0x55/0xc3
>        [<ffffffff8020acac>] call_softirq+0x1c/0x28
>        [<ffffffff8020c02b>] do_softirq+0x2c/0x7d
>        [<ffffffff8021864a>] smp_apic_timer_interrupt+0x57/0x6a
>        [<ffffffff80208e19>] mwait_idle+0x0/0x45
>        [<ffffffff8020a756>] apic_timer_interrupt+0x66/0x70
>        <EOI>  [<ffffffff80208e5b>] mwait_idle+0x42/0x45
>        [<ffffffff80208db1>] cpu_idle+0x8b/0xae
>        [<ffffffff80217d60>] start_secondary+0x47f/0x48f
> 
> The only way I found to avoid this (for now) is to check skb headroom in
> ipoib_hard_header. I guess that this safety check doesn't harm regular IPoIB 
> operation and it seems to solve my problem. However, I would be happy to hear 
> what
> others think of this last issue.

As I said, this seems to indicate a problem in the bonding code.
But what will happen after you error out in ipoib_hard_header?
Is the packet dropped? What might break as a result?

> I would really appreciate comments.
> 
> thanks
> 
>  -MoniS

------------------------------------------------------------------------------
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
b/drivers/infiniband/ulp/ipoib/ipoib.h
index 07deee8..31bc6d8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -216,6 +216,7 @@ struct ipoib_neigh {
        struct sk_buff_head queue;
 
        struct neighbour   *neighbour;
+       struct net_device *dev;
 
        struct list_head    list;
 };
@@ -232,7 +233,8 @@ static inline struct ipoib_neigh **to_ip
                                     INFINIBAND_ALEN, sizeof(void *));
 }
 
-struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh);
+struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh,
+                                     struct net_device *dev);
 void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh);
 
 extern struct workqueue_struct *ipoib_workqueue;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 705eb1d..0e3953e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -48,6 +48,8 @@ #include <linux/ip.h>
 #include <linux/in.h>
 
 #include <net/dst.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
 
 #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
 
@@ -70,6 +72,7 @@ module_param_named(debug_level, ipoib_de
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
 #endif
 
+static int ipoib_at_exit = 0;
 struct ipoib_path_iter {
        struct net_device *dev;
        struct ipoib_path  path;
@@ -490,7 +493,7 @@ static void neigh_add_path(struct sk_buf
        struct ipoib_path *path;
        struct ipoib_neigh *neigh;
 
-       neigh = ipoib_neigh_alloc(skb->dst->neighbour);
+       neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
        if (!neigh) {
                ++priv->stats.tx_dropped;
                dev_kfree_skb_any(skb);
@@ -735,6 +738,9 @@ static int ipoib_hard_header(struct sk_b
 {
        struct ipoib_header *header;
 
+       if (skb_headroom(skb) < sizeof *header) {
+               return -1;
+       }
        header = (struct ipoib_header *) skb_push(skb, sizeof *header);
 
        header->proto = htons(type);
@@ -746,8 +752,11 @@ static int ipoib_hard_header(struct sk_b
         * figure out where to send the packet later.
         */
        if ((!skb->dst || !skb->dst->neighbour) && daddr) {
-               struct ipoib_pseudoheader *phdr =
-                       (struct ipoib_pseudoheader *) skb_push(skb, sizeof 
*phdr);
+               struct ipoib_pseudoheader *phdr = NULL;
+               if (skb_headroom(skb) < sizeof *phdr) {
+                       return -1;
+               }
+               phdr = (struct ipoib_pseudoheader *) skb_push(skb, sizeof 
*phdr);
                memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
        }
 
@@ -769,32 +778,69 @@ static void ipoib_set_mcast_list(struct 
 static void ipoib_neigh_destructor(struct neighbour *n)
 {
        struct ipoib_neigh *neigh;
-       struct ipoib_dev_priv *priv = netdev_priv(n->dev);
+       struct ipoib_dev_priv *priv;
        unsigned long flags;
        struct ipoib_ah *ah = NULL;
 
-       ipoib_dbg(priv,
-                 "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
-                 IPOIB_QPN(n->ha),
-                 IPOIB_GID_RAW_ARG(n->ha + 4));
-
-       spin_lock_irqsave(&priv->lock, flags);
 
        neigh = *to_ipoib_neigh(n);
        if (neigh) {
+               priv = netdev_priv(neigh->dev);
+               ipoib_dbg(priv,
+                         "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
+                         IPOIB_QPN(n->ha),
+                         IPOIB_GID_RAW_ARG(n->ha + 4));
+
+               spin_lock_irqsave(&priv->lock, flags);
                if (neigh->ah)
                        ah = neigh->ah;
                list_del(&neigh->list);
                ipoib_neigh_free(n->dev, neigh);
+               spin_unlock_irqrestore(&priv->lock, flags);
        }
-
-       spin_unlock_irqrestore(&priv->lock, flags);
-
        if (ah)
                ipoib_put_ah(ah);
 }
 
-struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
+static void ipoib_neigh_tbl_cleanup_master(struct neigh_table *tbl,
+                                          struct net_device* master,
+                                          struct net_device* slave)
+{
+       int i;
+       struct ipoib_neigh *neigh;
+
+       write_lock_bh(&tbl->lock);
+       for (i = 0; i <= tbl->hash_mask; i++) {
+               struct neighbour *n, **np;
+
+               np = &tbl->hash_buckets[i];
+               while ((n = *np) != NULL) {
+                       write_lock(&n->lock);
+                       if (n->dev == master) {
+                               neigh = *to_ipoib_neigh(n);
+                               if (neigh && (neigh->dev == slave)){
+                                       if (ipoib_at_exit)
+                                               n->parms->neigh_destructor = 
NULL;
+                                       ipoib_neigh_destructor(n);
+                               }
+                       }
+                       write_unlock(&n->lock);
+                       np = &n->next;
+               }
+       }
+       write_unlock_bh(&tbl->lock);
+}
+
+static void ipoib_neigh_cleanup_by_master(struct net_device* master,struct 
net_device* slave){
+       netif_stop_queue(slave);
+       if (master) {
+               ipoib_neigh_tbl_cleanup_master(&arp_tbl,master, slave);
+               ipoib_neigh_tbl_cleanup_master(&nd_tbl,master, slave);
+       }
+}
+
+struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour,
+                                     struct net_device *dev)
 {
        struct ipoib_neigh *neigh;
 
@@ -803,6 +849,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(st
                return NULL;
 
        neigh->neighbour = neighbour;
+       neigh->dev = dev;
        *to_ipoib_neigh(neighbour) = neigh;
        skb_queue_head_init(&neigh->queue);
 
@@ -874,6 +921,7 @@ void ipoib_dev_cleanup(struct net_device
 
        /* Delete any child interfaces first */
        list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+               ipoib_neigh_cleanup_by_master(cpriv->dev->master, cpriv->dev);
                unregister_netdev(cpriv->dev);
                ipoib_dev_cleanup(cpriv->dev);
                free_netdev(cpriv->dev);
@@ -1159,6 +1207,7 @@ static void ipoib_remove_one(struct ib_d
                ib_unregister_event_handler(&priv->event_handler);
                flush_scheduled_work();
 
+               ipoib_neigh_cleanup_by_master(priv->dev->master, priv->dev);
                unregister_netdev(priv->dev);
                ipoib_dev_cleanup(priv->dev);
                free_netdev(priv->dev);
@@ -1217,6 +1266,8 @@ err_fs:
 
 static void __exit ipoib_cleanup_module(void)
 {
+       ipoib_at_exit = 1;
+
        ib_unregister_client(&ipoib_client);
        ib_sa_unregister_client(&ipoib_sa_client);
        ipoib_unregister_debugfs();
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index b04b72c..a41a949 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -774,7 +774,7 @@ out:
                if (skb->dst            &&
                    skb->dst->neighbour &&
                    !*to_ipoib_neigh(skb->dst->neighbour)) {
-                       struct ipoib_neigh *neigh = 
ipoib_neigh_alloc(skb->dst->neighbour);
+                       struct ipoib_neigh *neigh = 
ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
 
                        if (neigh) {
                                kref_get(&mcast->ah->ref);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 6a9f616..557be98 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -153,6 +153,7 @@ struct neigh_table nd_tbl = {
        .gc_thresh2 =    512,
        .gc_thresh3 =   1024,
 };
+EXPORT_SYMBOL(nd_tbl);
 
 /* ND options */
 struct ndisc_options {


-- 
MST

_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to