Way back in:

http://lists.openfabrics.org/pipermail/general/2008-May/050196.html

I described an IPoIB-related panic we were seeing on large 
clusters. The signature was a backtrace like this:

        skb_over_panic
        :ib_ipoib:ipoib_ib_handle_rx_wc
        :ib_ipoib:ipoib_poll
        net_rx_action
        .....

The bug is difficult to reproduce, but we finally got a crashdump, 
and the problem appears to be that stale skb pointers on the tx_ring 
were left pointing to skbs that had since been reused, so that the 
skb's data region was now unexpectedly short, etc. 

Recently LLNL reported something similar:

http://lists.openfabrics.org/pipermail/general/2008-October/054824.html

A patch similar to the following seems to fix things up. 

Ira, Al, if this looks OK, can you please sign off on it?

Signed-off-by: Arthur Kepner <[EMAIL PROTECTED]>

--- 

 ipoib_cm.c |    5 +++++
 ipoib_ib.c |    4 ++++
 2 files changed, 9 insertions(+)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 7b14c2c..8f8650b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -200,6 +200,7 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev,
                        ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
                                              rx_ring[i].mapping);
                        dev_kfree_skb_any(rx_ring[i].skb);
+                       rx_ring[i].skb = NULL;
                }
 
        vfree(rx_ring);
@@ -736,6 +737,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
        if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
                ++dev->stats.tx_errors;
                dev_kfree_skb_any(skb);
+               tx_req->skb = NULL;
                return;
        }
 
@@ -747,6 +749,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
                ++dev->stats.tx_errors;
                ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
                dev_kfree_skb_any(skb);
+               tx_req->skb = NULL;
        } else {
                dev->trans_start = jiffies;
                ++tx->tx_head;
@@ -785,6 +788,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct 
ib_wc *wc)
        dev->stats.tx_bytes += tx_req->skb->len;
 
        dev_kfree_skb_any(tx_req->skb);
+       tx_req->skb = NULL;
 
        netif_tx_lock(dev);
 
@@ -1179,6 +1183,7 @@ timeout:
                ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
                                    DMA_TO_DEVICE);
                dev_kfree_skb_any(tx_req->skb);
+               tx_req->skb = NULL;
                ++p->tx_tail;
                netif_tx_lock_bh(p->dev);
                if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 28eb6f0..f7e3497 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -383,6 +383,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, 
struct ib_wc *wc)
        dev->stats.tx_bytes += tx_req->skb->len;
 
        dev_kfree_skb_any(tx_req->skb);
+       tx_req->skb = NULL;
 
        ++priv->tx_tail;
        if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
@@ -572,6 +573,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
        if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
                ++dev->stats.tx_errors;
                dev_kfree_skb_any(skb);
+               tx_req->skb = NULL;
                return;
        }
 
@@ -594,6 +596,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
                --priv->tx_outstanding;
                ipoib_dma_unmap_tx(priv->ca, tx_req);
                dev_kfree_skb_any(skb);
+               tx_req->skb = NULL;
                if (netif_queue_stopped(dev))
                        netif_wake_queue(dev);
        } else {
@@ -833,6 +836,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
                                                        (ipoib_sendq_size - 1)];
                                ipoib_dma_unmap_tx(priv->ca, tx_req);
                                dev_kfree_skb_any(tx_req->skb);
+                               tx_req->skb = NULL;
                                ++priv->tx_tail;
                                --priv->tx_outstanding;
                        }
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to