Gitweb:     http://git.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=1b524963fd2d7fb20ea68df497151aa9d17fbca4
Commit:     1b524963fd2d7fb20ea68df497151aa9d17fbca4
Parent:     cbfb50e6e2e9c580848c0f51d37c24cdfb1cb704
Author:     Michael S. Tsirkin <[EMAIL PROTECTED]>
AuthorDate: Thu Aug 16 15:36:16 2007 +0300
Committer:  Roland Dreier <[EMAIL PROTECTED]>
CommitDate: Fri Oct 19 21:39:34 2007 -0700

    IPoIB/cm: Use common CQ for CM send completions
    
    Use the same CQ for CM send completions as for all other IPoIB
    completions.  This means all completions are processed via the same
    NAPI polling routine.  This should help reduce the number of
    interrupts for bi-directional traffic (such as TCP) and fixes "driver
    is hogging interrupts" errors reported for IPoIB send side, e.g.
    <https://bugs.openfabrics.org/show_bug.cgi?id=508>
    
    To do this, keep a per-interface counter of outstanding send WRs, and
    stop the interface when this counter reaches the send queue size to
    avoid CQ overruns.
    
    Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]>
    Signed-off-by: Roland Dreier <[EMAIL PROTECTED]>
---
 drivers/infiniband/ulp/ipoib/ipoib.h      |   15 +++--
 drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  112 ++++++++++++-----------------
 drivers/infiniband/ulp/ipoib/ipoib_ib.c   |   46 +++++++-----
 drivers/infiniband/ulp/ipoib/ipoib_main.c |    4 +-
 4 files changed, 82 insertions(+), 95 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 6545fa7..0a00ea0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -84,9 +84,8 @@ enum {
        IPOIB_MCAST_RUN           = 6,
        IPOIB_STOP_REAPER         = 7,
        IPOIB_MCAST_STARTED       = 8,
-       IPOIB_FLAG_NETIF_STOPPED  = 9,
-       IPOIB_FLAG_ADMIN_CM       = 10,
-       IPOIB_FLAG_UMCAST         = 11,
+       IPOIB_FLAG_ADMIN_CM       = 9,
+       IPOIB_FLAG_UMCAST         = 10,
 
        IPOIB_MAX_BACKOFF_SECONDS = 16,
 
@@ -98,9 +97,9 @@ enum {
 
 #define        IPOIB_OP_RECV   (1ul << 31)
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
-#define        IPOIB_CM_OP_SRQ (1ul << 30)
+#define        IPOIB_OP_CM     (1ul << 30)
 #else
-#define        IPOIB_CM_OP_SRQ (0)
+#define        IPOIB_OP_CM     (0)
 #endif
 
 /* structs */
@@ -197,7 +196,6 @@ struct ipoib_cm_rx {
 
 struct ipoib_cm_tx {
        struct ib_cm_id     *id;
-       struct ib_cq        *cq;
        struct ib_qp        *qp;
        struct list_head     list;
        struct net_device   *dev;
@@ -294,6 +292,7 @@ struct ipoib_dev_priv {
        unsigned             tx_tail;
        struct ib_sge        tx_sge;
        struct ib_send_wr    tx_wr;
+       unsigned             tx_outstanding;
 
        struct ib_wc ibwc[IPOIB_NUM_WC];
 
@@ -502,6 +501,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
 void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
                           unsigned int mtu);
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
 #else
 
 struct ipoib_cm_tx;
@@ -590,6 +590,9 @@ static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *w
 {
 }
 
+static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+}
 #endif
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index ddf0c54..8761077 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -87,7 +87,7 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id)
        struct ib_recv_wr *bad_wr;
        int i, ret;
 
-       priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;
+       priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
        for (i = 0; i < IPOIB_CM_RX_SG; ++i)
                priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
@@ -401,7 +401,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
-       unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
+       unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
        struct sk_buff *skb, *newskb;
        struct ipoib_cm_rx *p;
        unsigned long flags;
@@ -412,7 +412,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
                       wr_id, wc->status);
 
        if (unlikely(wr_id >= ipoib_recvq_size)) {
-               if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
+               if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
                        spin_lock_irqsave(&priv->lock, flags);
                        list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
                        ipoib_cm_start_rx_drain(priv);
@@ -498,7 +498,7 @@ static inline int post_send(struct ipoib_dev_priv *priv,
        priv->tx_sge.addr             = addr;
        priv->tx_sge.length           = len;
 
-       priv->tx_wr.wr_id             = wr_id;
+       priv->tx_wr.wr_id             = wr_id | IPOIB_OP_CM;
 
        return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
 }
@@ -549,20 +549,19 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
                dev->trans_start = jiffies;
                ++tx->tx_head;
 
-               if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) {
+               if (++priv->tx_outstanding == ipoib_sendq_size) {
                        ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
                                  tx->qp->qp_num);
                        netif_stop_queue(dev);
-                       set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
                }
        }
 }
 
-static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx,
-                                 struct ib_wc *wc)
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
-       unsigned int wr_id = wc->wr_id;
+       struct ipoib_cm_tx *tx = wc->qp->qp_context;
+       unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
        struct ipoib_tx_buf *tx_req;
        unsigned long flags;
 
@@ -587,11 +586,10 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx
 
        spin_lock_irqsave(&priv->tx_lock, flags);
        ++tx->tx_tail;
-       if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) &&
-           tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) {
-               clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
+       if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+           netif_queue_stopped(dev) &&
+           test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
                netif_wake_queue(dev);
-       }
 
        if (wc->status != IB_WC_SUCCESS &&
            wc->status != IB_WC_WR_FLUSH_ERR) {
@@ -614,11 +612,6 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx
                        tx->neigh = NULL;
                }
 
-               /* queue would be re-started anyway when TX is destroyed,
-                * but it makes sense to do it ASAP here. */
-               if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags))
-                       netif_wake_queue(dev);
-
                if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
                        list_move(&tx->list, &priv->cm.reap_list);
                        queue_work(ipoib_workqueue, &priv->cm.reap_task);
@@ -632,19 +625,6 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx
        spin_unlock_irqrestore(&priv->tx_lock, flags);
 }
 
-static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
-{
-       struct ipoib_cm_tx *tx = tx_ptr;
-       int n, i;
-
-       ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-       do {
-               n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc);
-               for (i = 0; i < n; ++i)
-                       ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i);
-       } while (n == IPOIB_NUM_WC);
-}
-
 int ipoib_cm_dev_open(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -807,17 +787,18 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
        return 0;
 }
 
-static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq)
+static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ib_qp_init_attr attr = {
-               .send_cq                = cq,
+               .send_cq                = priv->cq,
                .recv_cq                = priv->cq,
                .srq                    = priv->cm.srq,
                .cap.max_send_wr        = ipoib_sendq_size,
                .cap.max_send_sge       = 1,
                .sq_sig_type            = IB_SIGNAL_ALL_WR,
                .qp_type                = IB_QPT_RC,
+               .qp_context             = tx
         };
 
        return ib_create_qp(priv->pd, &attr);
@@ -899,21 +880,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
                goto err_tx;
        }
 
-       p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p,
-                            ipoib_sendq_size + 1, 0);
-       if (IS_ERR(p->cq)) {
-               ret = PTR_ERR(p->cq);
-               ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret);
-               goto err_cq;
-       }
-
-       ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP);
-       if (ret) {
-               ipoib_warn(priv, "failed to request completion notification: %d\n", ret);
-               goto err_req_notify;
-       }
-
-       p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq);
+       p->qp = ipoib_cm_create_tx_qp(p->dev, p);
        if (IS_ERR(p->qp)) {
                ret = PTR_ERR(p->qp);
                ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
@@ -950,12 +917,8 @@ err_modify:
 err_id:
        p->id = NULL;
        ib_destroy_qp(p->qp);
-err_req_notify:
 err_qp:
        p->qp = NULL;
-       ib_destroy_cq(p->cq);
-err_cq:
-       p->cq = NULL;
 err_tx:
        return ret;
 }
@@ -964,6 +927,8 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 {
        struct ipoib_dev_priv *priv = netdev_priv(p->dev);
        struct ipoib_tx_buf *tx_req;
+       unsigned long flags;
+       unsigned long begin;
 
        ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
                  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
@@ -971,27 +936,40 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
        if (p->id)
                ib_destroy_cm_id(p->id);
 
-       if (p->qp)
-               ib_destroy_qp(p->qp);
-
-       if (p->cq)
-               ib_destroy_cq(p->cq);
-
-       if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags))
-               netif_wake_queue(p->dev);
-
        if (p->tx_ring) {
+               /* Wait for all sends to complete */
+               begin = jiffies;
                while ((int) p->tx_tail - (int) p->tx_head < 0) {
-                       tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-                       ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
-                                        DMA_TO_DEVICE);
-                       dev_kfree_skb_any(tx_req->skb);
-                       ++p->tx_tail;
+                       if (time_after(jiffies, begin + 5 * HZ)) {
+                               ipoib_warn(priv, "timing out; %d sends not completed\n",
+                                          p->tx_head - p->tx_tail);
+                               goto timeout;
+                       }
+
+                       msleep(1);
                }
+       }
 
-               kfree(p->tx_ring);
+timeout:
+
+       while ((int) p->tx_tail - (int) p->tx_head < 0) {
+               tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
+               ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
+                                   DMA_TO_DEVICE);
+               dev_kfree_skb_any(tx_req->skb);
+               ++p->tx_tail;
+               spin_lock_irqsave(&priv->tx_lock, flags);
+               if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+                   netif_queue_stopped(p->dev) &&
+                   test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+                       netif_wake_queue(p->dev);
+               spin_unlock_irqrestore(&priv->tx_lock, flags);
        }
 
+       if (p->qp)
+               ib_destroy_qp(p->qp);
+
+       kfree(p->tx_ring);
        kfree(p);
 }
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 6a5f9fc..5063dd5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -267,11 +267,10 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
        spin_lock_irqsave(&priv->tx_lock, flags);
        ++priv->tx_tail;
-       if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
-           priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
-               clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+       if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+           netif_queue_stopped(dev) &&
+           test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
                netif_wake_queue(dev);
-       }
        spin_unlock_irqrestore(&priv->tx_lock, flags);
 
        if (wc->status != IB_WC_SUCCESS &&
@@ -301,14 +300,18 @@ poll_more:
                for (i = 0; i < n; i++) {
                        struct ib_wc *wc = priv->ibwc + i;
 
-                       if (wc->wr_id & IPOIB_CM_OP_SRQ) {
-                               ++done;
-                               ipoib_cm_handle_rx_wc(dev, wc);
-                       } else if (wc->wr_id & IPOIB_OP_RECV) {
+                       if (wc->wr_id & IPOIB_OP_RECV) {
                                ++done;
-                               ipoib_ib_handle_rx_wc(dev, wc);
-                       } else
-                               ipoib_ib_handle_tx_wc(dev, wc);
+                               if (wc->wr_id & IPOIB_OP_CM)
+                                       ipoib_cm_handle_rx_wc(dev, wc);
+                               else
+                                       ipoib_ib_handle_rx_wc(dev, wc);
+                       } else {
+                               if (wc->wr_id & IPOIB_OP_CM)
+                                       ipoib_cm_handle_tx_wc(dev, wc);
+                               else
+                                       ipoib_ib_handle_tx_wc(dev, wc);
+                       }
                }
 
                if (n != t)
@@ -401,10 +404,9 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
                address->last_send = priv->tx_head;
                ++priv->tx_head;
 
-               if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
+               if (++priv->tx_outstanding == ipoib_sendq_size) {
                        ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
                        netif_stop_queue(dev);
-                       set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
                }
        }
 }
@@ -563,12 +565,17 @@ void ipoib_drain_cq(struct net_device *dev)
                        if (priv->ibwc[i].status == IB_WC_SUCCESS)
                                priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
 
-                       if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ)
-                               ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
-                       else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV)
-                               ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
-                       else
-                               ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
+                       if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
+                               if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
+                                       ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
+                               else
+                                       ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
+                       } else {
+                               if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
+                                       ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
+                               else
+                                       ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
+                       }
                }
        } while (n == IPOIB_NUM_WC);
 }
@@ -614,6 +621,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
                                                    DMA_TO_DEVICE);
                                dev_kfree_skb_any(tx_req->skb);
                                ++priv->tx_tail;
+                               --priv->tx_outstanding;
                        }
 
                        for (i = 0; i < ipoib_recvq_size; ++i) {
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index e072f3c..ace2345 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -148,8 +148,6 @@ static int ipoib_stop(struct net_device *dev)
 
        netif_stop_queue(dev);
 
-       clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
-
        /*
         * Now flush workqueue to make sure a scheduled task doesn't
         * bring our internal state back up.
@@ -895,7 +893,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
                goto out_rx_ring_cleanup;
        }
 
-       /* priv->tx_head & tx_tail are already 0 */
+       /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
 
        if (ipoib_ib_dev_init(dev, ca, port))
                goto out_tx_ring_cleanup;
-
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to