Use common CQ for all TX QPs: keep a per-device counter of outstanding TX WRs,
and stop the interface when this counter reaches the send queue size, to avoid
CQ overruns.  This should help reduce the number of interrupts for
bi-directional traffic (such as TCP).

This helps fix "driver is hogging interrupts" errors reported for IPoIB send 
side.
See e.g. https://bugs.openfabrics.org/show_bug.cgi?id=508

Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]>

---

Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib.h 2007-08-16 
15:23:30.000000000 +0300
+++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib.h      2007-08-16 
15:23:56.000000000 +0300
@@ -97,9 +97,9 @@ enum {
 
 #define        IPOIB_OP_RECV   (1ul << 31)
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
-#define        IPOIB_CM_OP_SRQ (1ul << 30)
+#define        IPOIB_OP_CM     (1ul << 30)
 #else
-#define        IPOIB_CM_OP_SRQ (0)
+#define        IPOIB_OP_CM     (0)
 #endif
 
 /* structs */
@@ -176,7 +176,6 @@ struct ipoib_cm_rx {
 
 struct ipoib_cm_tx {
        struct ib_cm_id     *id;
-       struct ib_cq        *cq;
        struct ib_qp        *qp;
        struct list_head     list;
        struct net_device   *dev;
@@ -271,6 +270,7 @@ struct ipoib_dev_priv {
        unsigned             tx_tail;
        struct ib_sge        tx_sge;
        struct ib_send_wr    tx_wr;
+       unsigned             tx_outstanding;
 
        struct ib_wc ibwc[IPOIB_NUM_WC];
 
@@ -480,6 +480,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm
 void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
                           unsigned int mtu);
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
 #else
 
 struct ipoib_cm_tx;
@@ -568,6 +569,9 @@ static inline void ipoib_cm_handle_rx_wc
 {
 }
 
+static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc 
*wc)
+{
+}
 #endif
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_cm.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_cm.c      2007-08-16 
15:23:30.000000000 +0300
+++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_cm.c   2007-08-16 
15:23:56.000000000 +0300
@@ -87,7 +87,7 @@ static int ipoib_cm_post_receive(struct 
        struct ib_recv_wr *bad_wr;
        int i, ret;
 
-       priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;
+       priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
        for (i = 0; i < IPOIB_CM_RX_SG; ++i)
                priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
@@ -401,7 +401,7 @@ static void skb_put_frags(struct sk_buff
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
-       unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
+       unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
        struct sk_buff *skb, *newskb;
        struct ipoib_cm_rx *p;
        unsigned long flags;
@@ -412,7 +412,7 @@ void ipoib_cm_handle_rx_wc(struct net_de
                       wr_id, wc->status);
 
        if (unlikely(wr_id >= ipoib_recvq_size)) {
-               if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
+               if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | 
IPOIB_OP_RECV))) {
                        spin_lock_irqsave(&priv->lock, flags);
                        list_splice_init(&priv->cm.rx_drain_list, 
&priv->cm.rx_reap_list);
                        ipoib_cm_start_rx_drain(priv);
@@ -498,7 +498,7 @@ static inline int post_send(struct ipoib
        priv->tx_sge.addr             = addr;
        priv->tx_sge.length           = len;
 
-       priv->tx_wr.wr_id             = wr_id;
+       priv->tx_wr.wr_id             = wr_id | IPOIB_OP_CM;
 
        return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
 }
@@ -549,20 +549,19 @@ void ipoib_cm_send(struct net_device *de
                dev->trans_start = jiffies;
                ++tx->tx_head;
 
-               if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) {
+               if (++priv->tx_outstanding == ipoib_sendq_size) {
                        ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net 
queue\n",
                                  tx->qp->qp_num);
                        netif_stop_queue(dev);
-                       set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
                }
        }
 }
 
-static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx 
*tx,
-                                 struct ib_wc *wc)
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
-       unsigned int wr_id = wc->wr_id;
+       struct ipoib_cm_tx *tx = wc->qp->qp_context;
+       unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
        struct ipoib_tx_buf *tx_req;
        unsigned long flags;
 
@@ -587,11 +586,10 @@ static void ipoib_cm_handle_tx_wc(struct
 
        spin_lock_irqsave(&priv->tx_lock, flags);
        ++tx->tx_tail;
-       if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) &&
-           tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) {
-               clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
+       if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+           netif_queue_stopped(dev) &&
+           test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
                netif_wake_queue(dev);
-       }
 
        if (wc->status != IB_WC_SUCCESS &&
            wc->status != IB_WC_WR_FLUSH_ERR) {
@@ -614,11 +612,6 @@ static void ipoib_cm_handle_tx_wc(struct
                        tx->neigh = NULL;
                }
 
-               /* queue would be re-started anyway when TX is destroyed,
-                * but it makes sense to do it ASAP here. */
-               if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags))
-                       netif_wake_queue(dev);
-
                if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
                        list_move(&tx->list, &priv->cm.reap_list);
                        queue_work(ipoib_workqueue, &priv->cm.reap_task);
@@ -632,19 +625,6 @@ static void ipoib_cm_handle_tx_wc(struct
        spin_unlock_irqrestore(&priv->tx_lock, flags);
 }
 
-static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
-{
-       struct ipoib_cm_tx *tx = tx_ptr;
-       int n, i;
-
-       ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-       do {
-               n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc);
-               for (i = 0; i < n; ++i)
-                       ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i);
-       } while (n == IPOIB_NUM_WC);
-}
-
 int ipoib_cm_dev_open(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -807,17 +787,18 @@ static int ipoib_cm_rep_handler(struct i
        return 0;
 }
 
-static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct 
ib_cq *cq)
+static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct 
ipoib_cm_tx *tx)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ib_qp_init_attr attr = {};
        attr.recv_cq = priv->cq;
+       attr.send_cq = priv->cq;
        attr.srq = priv->cm.srq;
        attr.cap.max_send_wr = ipoib_sendq_size;
        attr.cap.max_send_sge = 1;
        attr.sq_sig_type = IB_SIGNAL_ALL_WR;
        attr.qp_type = IB_QPT_RC;
-       attr.send_cq = cq;
+       attr.qp_context = tx;
        return ib_create_qp(priv->pd, &attr);
 }
 
@@ -897,21 +878,7 @@ static int ipoib_cm_tx_init(struct ipoib
                goto err_tx;
        }
 
-       p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p,
-                            ipoib_sendq_size + 1, 0);
-       if (IS_ERR(p->cq)) {
-               ret = PTR_ERR(p->cq);
-               ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret);
-               goto err_cq;
-       }
-
-       ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP);
-       if (ret) {
-               ipoib_warn(priv, "failed to request completion notification: 
%d\n", ret);
-               goto err_req_notify;
-       }
-
-       p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq);
+       p->qp = ipoib_cm_create_tx_qp(p->dev, p);
        if (IS_ERR(p->qp)) {
                ret = PTR_ERR(p->qp);
                ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
@@ -948,12 +915,8 @@ err_modify:
 err_id:
        p->id = NULL;
        ib_destroy_qp(p->qp);
-err_req_notify:
 err_qp:
        p->qp = NULL;
-       ib_destroy_cq(p->cq);
-err_cq:
-       p->cq = NULL;
 err_tx:
        return ret;
 }
@@ -962,6 +925,8 @@ static void ipoib_cm_tx_destroy(struct i
 {
        struct ipoib_dev_priv *priv = netdev_priv(p->dev);
        struct ipoib_tx_buf *tx_req;
+       unsigned long flags;
+       unsigned long begin;
 
        ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
                  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
@@ -969,27 +934,40 @@ static void ipoib_cm_tx_destroy(struct i
        if (p->id)
                ib_destroy_cm_id(p->id);
 
-       if (p->qp)
-               ib_destroy_qp(p->qp);
-
-       if (p->cq)
-               ib_destroy_cq(p->cq);
-
-       if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags))
-               netif_wake_queue(p->dev);
-
        if (p->tx_ring) {
+               /* Wait for all sends to complete */
+               begin = jiffies;
                while ((int) p->tx_tail - (int) p->tx_head < 0) {
-                       tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 
1)];
-                       ib_dma_unmap_single(priv->ca, tx_req->mapping, 
tx_req->skb->len,
-                                        DMA_TO_DEVICE);
-                       dev_kfree_skb_any(tx_req->skb);
-                       ++p->tx_tail;
+                       if (time_after(jiffies, begin + 5 * HZ)) {
+                               ipoib_warn(priv, "timing out; %d sends not 
completed\n",
+                                          p->tx_head - p->tx_tail);
+                               goto timeout;
+                       }
+
+                       msleep(1);
                }
+       }
 
-               kfree(p->tx_ring);
+timeout:
+
+       while ((int) p->tx_tail - (int) p->tx_head < 0) {
+               tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
+               ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
+                                   DMA_TO_DEVICE);
+               dev_kfree_skb_any(tx_req->skb);
+               ++p->tx_tail;
+               spin_lock_irqsave(&priv->tx_lock, flags);
+               if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+                   netif_queue_stopped(p->dev) &&
+                   test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+                       netif_wake_queue(p->dev);
+               spin_unlock_irqrestore(&priv->tx_lock, flags);
        }
 
+       if (p->qp)
+               ib_destroy_qp(p->qp);
+
+       kfree(p->tx_ring);
        kfree(p);
 }
 
Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c      2007-08-16 
15:23:30.000000000 +0300
+++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_ib.c   2007-08-16 
15:23:56.000000000 +0300
@@ -267,11 +267,10 @@ static void ipoib_ib_handle_tx_wc(struct
 
        spin_lock_irqsave(&priv->tx_lock, flags);
        ++priv->tx_tail;
-       if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
-           priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
-               clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+       if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+           netif_queue_stopped(dev) &&
+           test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
                netif_wake_queue(dev);
-       }
        spin_unlock_irqrestore(&priv->tx_lock, flags);
 
        if (wc->status != IB_WC_SUCCESS &&
@@ -300,16 +299,19 @@ int ipoib_poll(struct net_device *dev, i
                for (i = 0; i < n; ++i) {
                        struct ib_wc *wc = priv->ibwc + i;
 
-                       if (wc->wr_id & IPOIB_CM_OP_SRQ) {
-                               ++done;
-                               --max;
-                               ipoib_cm_handle_rx_wc(dev, wc);
-                       } else if (wc->wr_id & IPOIB_OP_RECV) {
+                       if (wc->wr_id & IPOIB_OP_RECV) {
                                ++done;
                                --max;
-                               ipoib_ib_handle_rx_wc(dev, wc);
-                       } else
-                               ipoib_ib_handle_tx_wc(dev, wc);
+                               if (wc->wr_id & IPOIB_OP_CM)
+                                       ipoib_cm_handle_rx_wc(dev, wc);
+                               else
+                                       ipoib_ib_handle_rx_wc(dev, wc);
+                       } else {
+                               if (wc->wr_id & IPOIB_OP_CM)
+                                       ipoib_cm_handle_tx_wc(dev, wc);
+                               else
+                                       ipoib_ib_handle_tx_wc(dev, wc);
+                       }
                }
 
                if (n != t) {
@@ -406,10 +408,9 @@ void ipoib_send(struct net_device *dev, 
                address->last_send = priv->tx_head;
                ++priv->tx_head;
 
-               if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
+               if (++priv->tx_outstanding == ipoib_sendq_size) {
                        ipoib_dbg(priv, "TX ring full, stopping kernel net 
queue\n");
                        netif_stop_queue(dev);
-                       set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
                }
        }
 }
@@ -558,12 +559,18 @@ void ipoib_drain_cq(struct net_device *d
        do {
                n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc);
                for (i = 0; i < n; ++i) {
-                       if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ)
-                               ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
-                       else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV)
-                               ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
-                       else
-                               ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
+                       struct ib_wc *wc = priv->ibwc + i;
+                       if (wc->wr_id & IPOIB_OP_RECV) {
+                               if (wc->wr_id & IPOIB_OP_CM)
+                                       ipoib_cm_handle_rx_wc(dev, wc);
+                               else
+                                       ipoib_ib_handle_rx_wc(dev, wc);
+                       } else {
+                               if (wc->wr_id & IPOIB_OP_CM)
+                                       ipoib_cm_handle_tx_wc(dev, wc);
+                               else
+                                       ipoib_ib_handle_tx_wc(dev, wc);
+                       }
                }
        } while (n == IPOIB_NUM_WC);
 }
@@ -610,6 +617,7 @@ int ipoib_ib_dev_stop(struct net_device 
                                                    DMA_TO_DEVICE);
                                dev_kfree_skb_any(tx_req->skb);
                                ++priv->tx_tail;
+                               --priv->tx_outstanding;
                        }
 
                        for (i = 0; i < ipoib_recvq_size; ++i) {
Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c    2007-08-16 
15:23:30.000000000 +0300
+++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-16 
15:23:56.000000000 +0300
@@ -896,7 +896,7 @@ int ipoib_dev_init(struct net_device *de
                goto out_rx_ring_cleanup;
        }
 
-       /* priv->tx_head & tx_tail are already 0 */
+       /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
 
        if (ipoib_ib_dev_init(dev, ca, port))
                goto out_tx_ring_cleanup;

-- 
MST
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to