We're getting panics like this one on big clusters:

skb_over_panic: text:ffffffff8821f32e len:160 put:100 head:ffff810372b0f000 
data:ffff810372b0f01c tail:ffff810372b0f0bc end:ffff810372b0f080 dev:ib0
----------- [cut here ] --------- [please bite here ] ---------
Kernel BUG at net/core/skbuff.c:94
invalid opcode: 0000 [1] SMP
last sysfs file: /class/infiniband/mlx4_0/node_type
CPU 0
Modules linked in: worm sg sd_mod crc32c libcrc32c rdma_ucm rdma_cm iw_cm 
ib_addr ib_uverbs ib_umad iw_cxgb3 cxgb3 firmware_class mlx4_ib ib_mthca 
iscsi_tcp libiscsi scsi_transport_iscsi ib_ipoib ib_cm ib_sa ib_mad ib_core 
ipv6 loop numatools xpmem shpchp pci_hotplug i2c_i801 i2c_core mlx4_core libata 
scsi_mod nfs lockd nfs_acl af_packet sunrpc e1000
Pid: 0, comm: swapper Tainted: G     U 2.6.16.46-0.12-smp #1
RIP: 0010:[<ffffffff8027a830>] <ffffffff8027a830>{skb_over_panic+77}
RSP: 0018:ffffffff80417e28  EFLAGS: 00010292
RAX: 0000000000000098 RBX: ffff81041b4bee08 RCX: 0000000000000292
RDX: ffffffff80347868 RSI: 0000000000000292 RDI: ffffffff80347860
RBP: ffff8103725817c0 R08: ffffffff80347868 R09: ffff81041d94e3c0
R10: 0000000000000000 R11: 0000000000000000 R12: ffff81041b4be500
R13: 0000000000000060 R14: 0000000000000900 R15: ffffc20000078908
FS:  0000000000000000(0000) GS:ffffffff803be000(0000) knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 00002b44089dc000 CR3: 000000041f35d000 CR4: 00000000000006e0
Process swapper (pid: 0, threadinfo ffffffff803d8000, task ffffffff80341340)
Stack: ffff810372b0f0bc ffff810372b0f080 ffff81041b4be000 ffff81041b4be500
       0000000000000060 ffffffff8821f336 ffffffff80417ec8 ffff81041b4be000
       0000000417227014 0000000000000292
Call Trace: <IRQ> <ffffffff8821f336>{:ib_ipoib:ipoib_ib_handle_rx_wc+909}
       <ffffffff882205a2>{:ib_ipoib:ipoib_poll+159} 
<ffffffff802811a5>{net_rx_action+165}
       <ffffffff8013775d>{__do_softirq+85} <ffffffff8010c11e>{call_softirq+30}
       <ffffffff8010d07c>{do_softirq+44} <ffffffff8010d435>{do_IRQ+64}
       <ffffffff80109e3a>{mwait_idle+0} <ffffffff8010b25a>{ret_from_intr+0} 
<EOI>
       <ffffffff80109e3a>{mwait_idle+0} <ffffffff80109e70>{mwait_idle+54}
       <ffffffff80109e17>{cpu_idle+151} <ffffffff803da7ec>{start_kernel+601}
       <ffffffff803da28a>{_sinittext+650}


I started looking into what might cause this, and found that IPoIB 
always does something like this:

int ipoib_poll(struct net_device *dev, int *budget) 
{ 
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        ....
        ib_poll_cq(priv->rcq, t, priv->ibwc);

        for (i = 0; i < n; i++) {
                struct ib_wc *wc = priv->ibwc + i;
                ....
                ipoib_ib_handle_rx_wc(dev, wc);
                

What happens if, after we call ib_poll_cq() but before we process the 
rx completions in ipoib_ib_handle_rx_wc(), ipoib_poll() gets called 
again (on a different CPU)? That could corrupt the priv->ibwc array 
and lead to a panic like the one above. 

How about keeping the array of struct ib_wc on the stack? 

This has been tested only on a small system, not yet on one large
enough to verify that it prevents the panic. But this "obviously" 
needs to be fixed, no?

Signed-off-by: Arthur Kepner <[EMAIL PROTECTED]>

---
 ipoib.h    |    3 ---
 ipoib_ib.c |   31 +++++++++++++++++--------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff -rup a/drivers/infiniband/ulp/ipoib/ipoib.h 
b/drivers/infiniband/ulp/ipoib/ipoib.h
--- a/drivers/infiniband/ulp/ipoib/ipoib.h      2008-05-12 16:39:22.024109931 
-0700
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h      2008-05-13 16:21:52.433988977 
-0700
@@ -326,7 +326,6 @@ struct ipoib_cm_dev_priv {
        struct sk_buff_head     skb_queue;
        struct list_head        start_list;
        struct list_head        reap_list;
-       struct ib_wc            ibwc[IPOIB_NUM_WC];
        struct ib_sge           rx_sge[IPOIB_CM_RX_SG];
        struct ib_recv_wr       rx_wr;
        int                     nonsrq_conn_qp;
@@ -406,8 +405,6 @@ struct ipoib_dev_priv {
        struct ib_send_wr    tx_wr;
        unsigned             tx_outstanding;
 
-       struct ib_wc         ibwc[IPOIB_NUM_WC];
-       struct ib_wc         send_wc[MAX_SEND_CQE];
        unsigned int         tx_poll;
 
        struct list_head dead_ahs;
diff -rup a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c   2008-05-12 16:39:22.020109690 
-0700
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c   2008-05-13 17:19:28.809819954 
-0700
@@ -366,12 +366,13 @@ static void ipoib_ib_handle_tx_wc(struct
 
 void poll_tx(struct ipoib_dev_priv *priv)
 {
+       struct ib_wc send_wc[MAX_SEND_CQE];
        int n, i;
 
        while (1) {
-               n = ib_poll_cq(priv->scq, MAX_SEND_CQE, priv->send_wc);
+               n = ib_poll_cq(priv->scq, MAX_SEND_CQE, send_wc);
                for (i = 0; i < n; ++i)
-                       ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
+                       ipoib_ib_handle_tx_wc(priv->dev, send_wc + i);
 
                if (n < MAX_SEND_CQE)
                        break;
@@ -380,6 +381,7 @@ void poll_tx(struct ipoib_dev_priv *priv
 
 int ipoib_poll(struct net_device *dev, int *budget)
 {
+       struct ib_wc ibwc[IPOIB_NUM_WC];
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int max = min(*budget, dev->quota);
        int done;
@@ -393,10 +395,10 @@ poll_more:
        while (max) {
 
                t = min(IPOIB_NUM_WC, max);
-               n = ib_poll_cq(priv->rcq, t, priv->ibwc);
+               n = ib_poll_cq(priv->rcq, t, ibwc);
 
                for (i = 0; i < n; i++) {
-                       struct ib_wc *wc = priv->ibwc + i;
+                       struct ib_wc *wc = ibwc + i;
 
                        if (wc->wr_id & IPOIB_OP_RECV) {
                                ++done;
@@ -783,29 +785,30 @@ static int recvs_pending(struct net_devi
 
 void ipoib_drain_cq(struct net_device *dev)
 {
+       struct ib_wc ibwc[IPOIB_NUM_WC];
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int i, n;
        do {
-               n = ib_poll_cq(priv->rcq, IPOIB_NUM_WC, priv->ibwc);
+               n = ib_poll_cq(priv->rcq, IPOIB_NUM_WC, ibwc);
                for (i = 0; i < n; ++i) {
                        /*
                         * Convert any successful completions to flush
                         * errors to avoid passing packets up the
                         * stack after bringing the device down.
                         */
-                       if (priv->ibwc[i].status == IB_WC_SUCCESS)
-                               priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
+                       if (ibwc[i].status == IB_WC_SUCCESS)
+                               ibwc[i].status = IB_WC_WR_FLUSH_ERR;
 
-                       if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
-                               if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
-                                       ipoib_cm_handle_rx_wc(dev, priv->ibwc + 
i);
+                       if (ibwc[i].wr_id & IPOIB_OP_RECV) {
+                               if (ibwc[i].wr_id & IPOIB_OP_CM)
+                                       ipoib_cm_handle_rx_wc(dev, ibwc + i);
                                else
-                                       ipoib_ib_handle_rx_wc(dev, priv->ibwc + 
i);
+                                       ipoib_ib_handle_rx_wc(dev, ibwc + i);
                        } else {
-                               if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
-                                       ipoib_cm_handle_tx_wc(dev, priv->ibwc + 
i);
+                               if (ibwc[i].wr_id & IPOIB_OP_CM)
+                                       ipoib_cm_handle_tx_wc(dev, ibwc + i);
                                else
-                                       ipoib_ib_handle_tx_wc(dev, priv->ibwc + 
i);
+                                       ipoib_ib_handle_tx_wc(dev, ibwc + i);
                        }
                }
        } while (n == IPOIB_NUM_WC);


_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to