On Wed, 5 Nov 2008 17:23:07 -0800
[EMAIL PROTECTED] wrote:

> 
> Way back in:
> 
> http:// lists.openfabrics.org/pipermail/general/2008-May/050196.html
> 
> I described an IPoIB-related panic we were seeing on large 
> clusters. The signature was a backtrace like this:
> 
>       skb_over_panic
>       :ib_ipoib:ipoib_ib_handle_rx_wc
>       :ib_ipoib:ipoib_poll
>       net_rx_action
>       .....
> 
> The bug is difficult to reproduce, but we finally got a crashdump, 
> and the problem appears to be that stale skb pointers on the tx_ring 
> were left pointing to skbs that had been since reused, so that the 
> skb's data region was now unexpectedly short, etc. 
> 
> Recently LLNL reported something similar:
> 
> http:// lists.openfabrics.org/pipermail/general/2008-October/054824.html
> 
> A patch similar to the following seems to fix thing up. 
> 
> Ira, Al, if this looks OK, can you please sign off on it?

Yep, looks good.

Ira

> 
> Signed-off-by: Arthur Kepner <[EMAIL PROTECTED]>
> 
> --- 
> 
>  ipoib_cm.c |    5 +++++
>  ipoib_ib.c |    4 ++++
>  2 files changed, 9 insertions(+)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
> b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> index 7b14c2c..8f8650b 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> @@ -200,6 +200,7 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev,
>                       ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
>                                             rx_ring[i].mapping);
>                       dev_kfree_skb_any(rx_ring[i].skb);
> +                     rx_ring[i].skb = NULL;
>               }
>  
>       vfree(rx_ring);
> @@ -736,6 +737,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
> *skb, struct ipoib_cm_
>       if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
>               ++dev->stats.tx_errors;
>               dev_kfree_skb_any(skb);
> +             tx_req->skb = NULL;
>               return;
>       }
>  
> @@ -747,6 +749,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
> *skb, struct ipoib_cm_
>               ++dev->stats.tx_errors;
>               ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
>               dev_kfree_skb_any(skb);
> +             tx_req->skb = NULL;
>       } else {
>               dev->trans_start = jiffies;
>               ++tx->tx_head;
> @@ -785,6 +788,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct 
> ib_wc *wc)
>       dev->stats.tx_bytes += tx_req->skb->len;
>  
>       dev_kfree_skb_any(tx_req->skb);
> +     tx_req->skb = NULL;
>  
>       netif_tx_lock(dev);
>  
> @@ -1179,6 +1183,7 @@ timeout:
>               ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
>                                   DMA_TO_DEVICE);
>               dev_kfree_skb_any(tx_req->skb);
> +             tx_req->skb = NULL;
>               ++p->tx_tail;
>               netif_tx_lock_bh(p->dev);
>               if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
> b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index 28eb6f0..f7e3497 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -383,6 +383,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, 
> struct ib_wc *wc)
>       dev->stats.tx_bytes += tx_req->skb->len;
>  
>       dev_kfree_skb_any(tx_req->skb);
> +     tx_req->skb = NULL;
>  
>       ++priv->tx_tail;
>       if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
> @@ -572,6 +573,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff 
> *skb,
>       if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
>               ++dev->stats.tx_errors;
>               dev_kfree_skb_any(skb);
> +             tx_req->skb = NULL;
>               return;
>       }
>  
> @@ -594,6 +596,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff 
> *skb,
>               --priv->tx_outstanding;
>               ipoib_dma_unmap_tx(priv->ca, tx_req);
>               dev_kfree_skb_any(skb);
> +             tx_req->skb = NULL;
>               if (netif_queue_stopped(dev))
>                       netif_wake_queue(dev);
>       } else {
> @@ -833,6 +836,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
>                                                       (ipoib_sendq_size - 1)];
>                               ipoib_dma_unmap_tx(priv->ca, tx_req);
>                               dev_kfree_skb_any(tx_req->skb);
> +                             tx_req->skb = NULL;
>                               ++priv->tx_tail;
>                               --priv->tx_outstanding;
>                       }
> 
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to