On Thu, 2008-11-06 at 08:04 -0800, [EMAIL PROTECTED] wrote:
> On Thu, Nov 06, 2008 at 10:40:32AM +0200, Eli Cohen wrote:
> > On Wed, Nov 05, 2008 at 05:23:07PM -0800, [EMAIL PROTECTED] wrote:
> > ...
> > looking at the patch, I don't understand why it should fix the problem
> > you're seeing. I suspect we may be hiding the problem.
> > 
> 
> I think that may be correct. 
>
> For the stale skb pointers to be reused by the ipoib driver, it 
> looks like we'd need to get 'unexpected' completions. 

I implemented the attached cheapo-debug-patch and installed it on one of
our clusters.  We hit the error condition (the "Oh crap" error message)
several times right before the same crashes.  So I think Arthur's patch
fixes something, although there may be a deeper underlying issue yet
to be solved.

Al

P.S.  I should note that when debugging this, I was looking at a
different stack trace than Arthur and Ira, but believed it to be the
same core issue.

-- 
Albert Chu
[EMAIL PROTECTED]
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory
Index: linux/drivers/infiniband/ulp/ipoib/ipoib_cm.c
===================================================================
--- linux/drivers/infiniband/ulp/ipoib/ipoib_cm.c	(revision 1677)
+++ linux/drivers/infiniband/ulp/ipoib/ipoib_cm.c	(working copy)
@@ -715,10 +715,12 @@
 	 */
 	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
 	tx_req->skb = skb;
+	tx_req->skb_should_be_null = 0;
 	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
 	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 		++priv->stats.tx_errors;
 		dev_kfree_skb_any(skb);
+		tx_req->skb_should_be_null = 1;
 		return;
 	}
 
@@ -730,6 +732,7 @@
 		++priv->stats.tx_errors;
 		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
 		dev_kfree_skb_any(skb);
+		tx_req->skb_should_be_null = 1;
 	} else {
 		dev->trans_start = jiffies;
 		++tx->tx_head;
@@ -768,6 +771,7 @@
 	priv->stats.tx_bytes += tx_req->skb->len;
 
 	dev_kfree_skb_any(tx_req->skb);
+	tx_req->skb_should_be_null = 1;
 
 	spin_lock_irqsave(&priv->tx_lock, flags);
 	++tx->tx_tail;
@@ -1157,6 +1161,7 @@
 		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
 				    DMA_TO_DEVICE);
 		dev_kfree_skb_any(tx_req->skb);
+		tx_req->skb_should_be_null = 1;
 		++p->tx_tail;
 		spin_lock_irqsave(&priv->tx_lock, flags);
 		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
Index: linux/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- linux/drivers/infiniband/ulp/ipoib/ipoib.h	(revision 1675)
+++ linux/drivers/infiniband/ulp/ipoib/ipoib.h	(working copy)
@@ -116,6 +116,7 @@
 struct ipoib_cm_tx_buf {
 	struct sk_buff *skb;
 	u64		mapping;
+	unsigned int skb_should_be_null;
 };
 
 struct ipoib_header {
@@ -157,6 +158,7 @@
 struct ipoib_tx_buf {
 	struct sk_buff *skb;
 	u64		mapping[MAX_SKB_FRAGS + 1];
+	unsigned int skb_should_be_null;
 };
 
 static inline int ipoib_dma_map_tx(struct ib_device *ca,
Index: linux/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- linux/drivers/infiniband/ulp/ipoib/ipoib_ib.c	(revision 1677)
+++ linux/drivers/infiniband/ulp/ipoib/ipoib_ib.c	(working copy)
@@ -331,10 +331,13 @@
 	tx_req = &priv->tx_ring[wr_id];
 
 	if (tx_req->skb) {
+		if (tx_req->skb_should_be_null)
+			printk("ACHU: Oh crap, skb should be NULL but it's %p\n", tx_req->skb);
 		ipoib_dma_unmap_tx(priv->ca, tx_req);
 		++priv->stats.tx_packets;
 		priv->stats.tx_bytes += tx_req->skb->len;
 		dev_kfree_skb_any(tx_req->skb);
+		tx_req->skb_should_be_null = 1;
 	}
 	++priv->tx_tail;
 	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
@@ -592,9 +595,11 @@
 	 */
 	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
 	tx_req->skb = skb;
+	tx_req->skb_should_be_null = 0;
 	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
 		++priv->stats.tx_errors;
 		dev_kfree_skb_any(skb);
+		tx_req->skb_should_be_null = 1;
 		return;
 	}
 
@@ -611,6 +616,7 @@
 		++priv->stats.tx_errors;
 		ipoib_dma_unmap_tx(priv->ca, tx_req);
 		dev_kfree_skb_any(skb);
+		tx_req->skb_should_be_null = 1;
 	} else {
 		dev->trans_start = jiffies;
 
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to