Use common CQ for all TX QPs: keep a per-device counter of outstanding TX WRs, and stop the interface when this counter reaches the send queue size, to avoid CQ overruns. This should help reduce the number of interrupts for bi-directional traffic (such as TCP).
This helps fix "driver is hogging interrupts" errors reported for IPoIB send side. See e.g. https://bugs.openfabrics.org/show_bug.cgi?id=508 Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]> --- Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib.h =================================================================== --- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib.h 2007-08-16 15:23:30.000000000 +0300 +++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib.h 2007-08-16 15:23:56.000000000 +0300 @@ -97,9 +97,9 @@ enum { #define IPOIB_OP_RECV (1ul << 31) #ifdef CONFIG_INFINIBAND_IPOIB_CM -#define IPOIB_CM_OP_SRQ (1ul << 30) +#define IPOIB_OP_CM (1ul << 30) #else -#define IPOIB_CM_OP_SRQ (0) +#define IPOIB_OP_CM (0) #endif /* structs */ @@ -176,7 +176,6 @@ struct ipoib_cm_rx { struct ipoib_cm_tx { struct ib_cm_id *id; - struct ib_cq *cq; struct ib_qp *qp; struct list_head list; struct net_device *dev; @@ -271,6 +270,7 @@ struct ipoib_dev_priv { unsigned tx_tail; struct ib_sge tx_sge; struct ib_send_wr tx_wr; + unsigned tx_outstanding; struct ib_wc ibwc[IPOIB_NUM_WC]; @@ -480,6 +480,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb, unsigned int mtu); void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc); #else struct ipoib_cm_tx; @@ -568,6 +569,9 @@ static inline void ipoib_cm_handle_rx_wc { } +static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} #endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_cm.c =================================================================== --- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_cm.c 2007-08-16 15:23:30.000000000 +0300 +++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_cm.c 2007-08-16 15:23:56.000000000 +0300 @@ -87,7 +87,7 @@ static int ipoib_cm_post_receive(struct struct ib_recv_wr *bad_wr; int i, ret; - priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ; + priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; for (i = 0; i < IPOIB_CM_RX_SG; ++i) priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; @@ -401,7 +401,7 @@ static void skb_put_frags(struct sk_buff void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); - unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ; + unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); struct sk_buff *skb, *newskb; struct ipoib_cm_rx *p; unsigned long flags; @@ -412,7 +412,7 @@ void ipoib_cm_handle_rx_wc(struct net_de wr_id, wc->status); if (unlikely(wr_id >= ipoib_recvq_size)) { - if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) { + if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); ipoib_cm_start_rx_drain(priv); @@ -498,7 +498,7 @@ static inline int post_send(struct ipoib priv->tx_sge.addr = addr; priv->tx_sge.length = len; - priv->tx_wr.wr_id = wr_id; + priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); } @@ -549,20 +549,19 @@ void ipoib_cm_send(struct net_device *de dev->trans_start = jiffies; ++tx->tx_head; - if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) { + if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", tx->qp->qp_num); netif_stop_queue(dev); - set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags); } } } -static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx, - struct ib_wc *wc) +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); - unsigned int wr_id = wc->wr_id; + struct ipoib_cm_tx *tx = wc->qp->qp_context; + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; struct ipoib_tx_buf *tx_req; unsigned long flags; @@ -587,11 +586,10 @@ static void ipoib_cm_handle_tx_wc(struct spin_lock_irqsave(&priv->tx_lock, flags); ++tx->tx_tail; - if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) && - tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) { - clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags); + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) netif_wake_queue(dev); - } if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) { @@ -614,11 +612,6 @@ static void ipoib_cm_handle_tx_wc(struct tx->neigh = NULL; } - /* queue would be re-started anyway when TX is destroyed, - * but it makes sense to do it ASAP here. */ - if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) - netif_wake_queue(dev); - if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); queue_work(ipoib_workqueue, &priv->cm.reap_task); @@ -632,19 +625,6 @@ static void ipoib_cm_handle_tx_wc(struct spin_unlock_irqrestore(&priv->tx_lock, flags); } -static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr) -{ - struct ipoib_cm_tx *tx = tx_ptr; - int n, i; - - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - do { - n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc); - for (i = 0; i < n; ++i) - ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i); - } while (n == IPOIB_NUM_WC); -} - int ipoib_cm_dev_open(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -807,17 +787,18 @@ static int ipoib_cm_rep_handler(struct i return 0; } -static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq) +static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_init_attr attr = {}; attr.recv_cq = priv->cq; + attr.send_cq = priv->cq; attr.srq = priv->cm.srq; attr.cap.max_send_wr = ipoib_sendq_size; attr.cap.max_send_sge = 1; attr.sq_sig_type = IB_SIGNAL_ALL_WR; attr.qp_type = IB_QPT_RC; - attr.send_cq = cq; + attr.qp_context = tx; return ib_create_qp(priv->pd, &attr); } @@ -897,21 +878,7 @@ static int ipoib_cm_tx_init(struct ipoib goto err_tx; } - p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p, - ipoib_sendq_size + 1, 0); - if (IS_ERR(p->cq)) { - ret = PTR_ERR(p->cq); - ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret); - goto err_cq; - } - - ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP); - if (ret) { - ipoib_warn(priv, "failed to request completion notification: %d\n", ret); - goto err_req_notify; - } - - p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq); + p->qp = ipoib_cm_create_tx_qp(p->dev, p); if (IS_ERR(p->qp)) { ret = PTR_ERR(p->qp); ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret); @@ -948,12 +915,8 @@ err_modify: err_id: p->id = NULL; ib_destroy_qp(p->qp); -err_req_notify: err_qp: p->qp = NULL; - ib_destroy_cq(p->cq); -err_cq: - p->cq = NULL; err_tx: return ret; } @@ -962,6 +925,8 @@ static void ipoib_cm_tx_destroy(struct i { struct ipoib_dev_priv *priv = netdev_priv(p->dev); struct ipoib_tx_buf *tx_req; + unsigned long flags; + unsigned long begin; ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail); @@ -969,27 +934,40 @@ static void ipoib_cm_tx_destroy(struct i if (p->id) ib_destroy_cm_id(p->id); - if (p->qp) - ib_destroy_qp(p->qp); - - if (p->cq) - ib_destroy_cq(p->cq); - - if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags)) - netif_wake_queue(p->dev); - if (p->tx_ring) { + /* Wait for all sends to complete */ + begin = jiffies; while ((int) p->tx_tail - (int) p->tx_head < 0) { - tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; - ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, - DMA_TO_DEVICE); - dev_kfree_skb_any(tx_req->skb); - ++p->tx_tail; + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "timing out; %d sends not completed\n", + p->tx_head - p->tx_tail); + goto timeout; + } + + msleep(1); } + } - kfree(p->tx_ring); +timeout: + + while ((int) p->tx_tail - (int) p->tx_head < 0) { + tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, + DMA_TO_DEVICE); + dev_kfree_skb_any(tx_req->skb); + ++p->tx_tail; + spin_lock_irqsave(&priv->tx_lock, flags); + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(p->dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(p->dev); + spin_unlock_irqrestore(&priv->tx_lock, flags); } + if (p->qp) + ib_destroy_qp(p->qp); + + kfree(p->tx_ring); kfree(p); } Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_ib.c =================================================================== --- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2007-08-16 15:23:30.000000000 +0300 +++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2007-08-16 15:23:56.000000000 +0300 @@ -267,11 +267,10 @@ static void ipoib_ib_handle_tx_wc(struct spin_lock_irqsave(&priv->tx_lock, flags); ++priv->tx_tail; - if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) && - priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) { - clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) netif_wake_queue(dev); - } spin_unlock_irqrestore(&priv->tx_lock, flags); if (wc->status != IB_WC_SUCCESS && @@ -300,16 +299,19 @@ int ipoib_poll(struct net_device *dev, i for (i = 0; i < n; ++i) { struct ib_wc *wc = priv->ibwc + i; - if (wc->wr_id & IPOIB_CM_OP_SRQ) { - ++done; - --max; - ipoib_cm_handle_rx_wc(dev, wc); - } else if (wc->wr_id & IPOIB_OP_RECV) { + if (wc->wr_id & IPOIB_OP_RECV) { ++done; --max; - ipoib_ib_handle_rx_wc(dev, wc); - } else - ipoib_ib_handle_tx_wc(dev, wc); + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else { + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_tx_wc(dev, wc); + else + ipoib_ib_handle_tx_wc(dev, wc); + } } if (n != t) { @@ -406,10 +408,9 @@ void ipoib_send(struct net_device *dev, address->last_send = priv->tx_head; ++priv->tx_head; - if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) { + if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); netif_stop_queue(dev); - set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); } } } @@ -558,12 +559,18 @@ void ipoib_drain_cq(struct net_device *d do { n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc); for (i = 0; i < n; ++i) { - if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ) - ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); - else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) - ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); - else - ipoib_ib_handle_tx_wc(dev, priv->ibwc + i); + struct ib_wc *wc = priv->ibwc + i; + if (wc->wr_id & IPOIB_OP_RECV) { + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else { + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_tx_wc(dev, wc); + else + ipoib_ib_handle_tx_wc(dev, wc); + } } } while (n == IPOIB_NUM_WC); } @@ -610,6 +617,7 @@ int ipoib_ib_dev_stop(struct net_device DMA_TO_DEVICE); dev_kfree_skb_any(tx_req->skb); ++priv->tx_tail; + --priv->tx_outstanding; } for (i = 0; i < ipoib_recvq_size; ++i) { Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-16 15:23:30.000000000 +0300 +++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-16 15:23:56.000000000 +0300 @@ -896,7 +896,7 @@ int ipoib_dev_init(struct net_device *de goto out_rx_ring_cleanup; } - /* priv->tx_head & tx_tail are already 0 */ + /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(dev, ca, port)) goto out_tx_ring_cleanup; -- MST _______________________________________________ general mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
