On Thu, 2008-11-06 at 08:04 -0800, [EMAIL PROTECTED] wrote:
> On Thu, Nov 06, 2008 at 10:40:32AM +0200, Eli Cohen wrote:
> > On Wed, Nov 05, 2008 at 05:23:07PM -0800, [EMAIL PROTECTED] wrote:
> > ...
> > Looking at the patch, I don't understand why it should fix the problem
> > you're seeing. I suspect we may be hiding the problem.
> >
>
> I think that may be correct.
>
> For the stale skb pointers to be reused by the ipoib driver, it
> looks like we'd need to get 'unexpected' completions.
I implemented the attached cheapo-debug-patch and installed it on one of
our clusters. We hit the error condition (the "Oh crap" error message)
several times right before the same crashes. So I think Arthur's patch
fixes something, although there may be a more fundamental underlying issue yet
to be solved.
Al
P.S. I should note that when debugging this, I was looking at a
different stack trace than Arthur and Ira, but believed it to be the
same core issue.
--
Albert Chu
[EMAIL PROTECTED]
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory
Index: linux/drivers/infiniband/ulp/ipoib/ipoib_cm.c
===================================================================
--- linux/drivers/infiniband/ulp/ipoib/ipoib_cm.c (revision 1677)
+++ linux/drivers/infiniband/ulp/ipoib/ipoib_cm.c (working copy)
@@ -715,10 +715,12 @@
*/
tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
tx_req->skb = skb;
+ tx_req->skb_should_be_null = 0;
addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
++priv->stats.tx_errors;
dev_kfree_skb_any(skb);
+ tx_req->skb_should_be_null = 1;
return;
}
@@ -730,6 +732,7 @@
++priv->stats.tx_errors;
ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
dev_kfree_skb_any(skb);
+ tx_req->skb_should_be_null = 1;
} else {
dev->trans_start = jiffies;
++tx->tx_head;
@@ -768,6 +771,7 @@
priv->stats.tx_bytes += tx_req->skb->len;
dev_kfree_skb_any(tx_req->skb);
+ tx_req->skb_should_be_null = 1;
spin_lock_irqsave(&priv->tx_lock, flags);
++tx->tx_tail;
@@ -1157,6 +1161,7 @@
ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
DMA_TO_DEVICE);
dev_kfree_skb_any(tx_req->skb);
+ tx_req->skb_should_be_null = 1;
++p->tx_tail;
spin_lock_irqsave(&priv->tx_lock, flags);
if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
Index: linux/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- linux/drivers/infiniband/ulp/ipoib/ipoib.h (revision 1675)
+++ linux/drivers/infiniband/ulp/ipoib/ipoib.h (working copy)
@@ -116,6 +116,7 @@
struct ipoib_cm_tx_buf {
struct sk_buff *skb;
u64 mapping;
+ unsigned int skb_should_be_null;
};
struct ipoib_header {
@@ -157,6 +158,7 @@
struct ipoib_tx_buf {
struct sk_buff *skb;
u64 mapping[MAX_SKB_FRAGS + 1];
+ unsigned int skb_should_be_null;
};
static inline int ipoib_dma_map_tx(struct ib_device *ca,
Index: linux/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- linux/drivers/infiniband/ulp/ipoib/ipoib_ib.c (revision 1677)
+++ linux/drivers/infiniband/ulp/ipoib/ipoib_ib.c (working copy)
@@ -331,10 +331,13 @@
tx_req = &priv->tx_ring[wr_id];
if (tx_req->skb) {
+ if (tx_req->skb_should_be_null)
+ printk("ACHU: Oh crap, skb should be NULL but it's %p\n", tx_req->skb);
ipoib_dma_unmap_tx(priv->ca, tx_req);
++priv->stats.tx_packets;
priv->stats.tx_bytes += tx_req->skb->len;
dev_kfree_skb_any(tx_req->skb);
+ tx_req->skb_should_be_null = 1;
}
++priv->tx_tail;
if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
@@ -592,9 +595,11 @@
*/
tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
tx_req->skb = skb;
+ tx_req->skb_should_be_null = 0;
if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
++priv->stats.tx_errors;
dev_kfree_skb_any(skb);
+ tx_req->skb_should_be_null = 1;
return;
}
@@ -611,6 +616,7 @@
++priv->stats.tx_errors;
ipoib_dma_unmap_tx(priv->ca, tx_req);
dev_kfree_skb_any(skb);
+ tx_req->skb_should_be_null = 1;
} else {
dev->trans_start = jiffies;
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general