Hello Roland,

Here is the patch to tune the IPoIB QP send/recv queue sizes. Please review. The attached file is provided so that you can apply the patch.
This patch includes:
        a. the two queue-size parameters are module parameters
        b. they are saved in the per-device private data (struct ipoib_dev_priv)
        c. the QP size has a max value of 4k and a min value of 64
        d. the QP size is forced to a power of 2 because tx_ring is indexed with a (size - 1) mask
        e. the QP sizes are logged in /var/log/messages
        f. rename TX_RING/RX_RING_SIZE to SENDQ/RECVQ_SIZE; the "ring" here is only a place to save pointers, not an actual ring
       
This patch addresses packet retransmission/timeout issues in large cluster environments. Performance also improves dramatically when these parameters are tuned: according to our MPI test results on a cluster, throughput is about 7 times better than with the default values.

Signed-off-by: Shirley Ma <[EMAIL PROTECTED]>

diff -urN infiniband/ulp/ipoib/ipoib.h infiniband-queue/ulp/ipoib/ipoib.h
--- infiniband/ulp/ipoib/ipoib.h        2006-03-26 11:57:15.000000000 -0800
+++ infiniband-queue/ulp/ipoib/ipoib.h        2006-03-31 08:46:34.171748048 -0800
@@ -66,8 +66,8 @@

        IPOIB_ENCAP_LEN           = 4,

-        IPOIB_RX_RING_SIZE           = 128,
-        IPOIB_TX_RING_SIZE           = 64,
+        IPOIB_SENDQ_SIZE           = 64,
+        IPOIB_RECVQ_SIZE           = 128,

        IPOIB_NUM_WC                   = 4,

@@ -186,6 +186,8 @@
        struct dentry *mcg_dentry;
        struct dentry *path_dentry;
#endif
+        int        sendq_size;
+        int         recvq_size;
};

struct ipoib_ah {
@@ -338,6 +340,8 @@
#define ipoib_warn(priv, format, arg...)                \
        ipoib_printk(KERN_WARNING, priv, format , ## arg)

+extern int ipoib_sendq_size;
+extern int ipoib_recvq_size;

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
extern int ipoib_debug_level;
diff -urN infiniband/ulp/ipoib/ipoib_ib.c infiniband-queue/ulp/ipoib/ipoib_ib.c
--- infiniband/ulp/ipoib/ipoib_ib.c        2006-03-26 11:57:15.000000000 -0800
+++ infiniband-queue/ulp/ipoib/ipoib_ib.c        2006-03-31 08:46:34.227739536 -0800

@@ -161,7 +161,7 @@
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int i;

-        for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) {
+        for (i = 0; i < priv->recvq_size; ++i) {
                if (ipoib_alloc_rx_skb(dev, i)) {
                        ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
                        return -ENOMEM;
@@ -187,7 +187,7 @@
        if (wr_id & IPOIB_OP_RECV) {
                wr_id &= ~IPOIB_OP_RECV;

-                if (wr_id < IPOIB_RX_RING_SIZE) {
+                if (wr_id < priv->recvq_size) {
                        struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
                        dma_addr_t      addr = priv->rx_ring[wr_id].mapping;

@@ -252,9 +252,9 @@
                struct ipoib_tx_buf *tx_req;
                unsigned long flags;

-                if (wr_id >= IPOIB_TX_RING_SIZE) {
+                if (wr_id >= priv->sendq_size) {
                        ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
-                                   wr_id, IPOIB_TX_RING_SIZE);
+                                   wr_id, priv->sendq_size);
                        return;
                }

@@ -275,7 +275,7 @@
                spin_lock_irqsave(&priv->tx_lock, flags);
                ++priv->tx_tail;
                if (netif_queue_stopped(dev) &&
-                    priv->tx_head - priv->tx_tail <= IPOIB_TX_RING_SIZE / 2)

+                    priv->tx_head - priv->tx_tail <= priv->sendq_size / 2)
                        netif_wake_queue(dev);
                spin_unlock_irqrestore(&priv->tx_lock, flags);

@@ -344,13 +344,13 @@
         * means we have to make sure everything is properly recorded and
         * our state is consistent before we call post_send().
         */
-        tx_req = &priv->tx_ring[priv->tx_head & (IPOIB_TX_RING_SIZE - 1)];
+        tx_req = &priv->tx_ring[priv->tx_head & (priv->sendq_size - 1)];
        tx_req->skb = skb;
        addr = dma_map_single(priv->ca->dma_device, skb->data, skb->len,
                              DMA_TO_DEVICE);
        pci_unmap_addr_set(tx_req, mapping, addr);

-        if (unlikely(post_send(priv, priv->tx_head & (IPOIB_TX_RING_SIZE - 1),
+        if (unlikely(post_send(priv, priv->tx_head & (priv->sendq_size - 1),
                               address->ah, qpn, addr, skb->len))) {
                ipoib_warn(priv, "post_send failed\n");
                ++priv->stats.tx_errors;
@@ -363,7 +363,7 @@
                address->last_send = priv->tx_head;
                ++priv->tx_head;

-                if (priv->tx_head - priv->tx_tail == IPOIB_TX_RING_SIZE) {
+                if (priv->tx_head - priv->tx_tail == priv->sendq_size) {

                         ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
                        netif_stop_queue(dev);
                }
@@ -488,7 +488,7 @@
        int pending = 0;
        int i;

-        for (i = 0; i < IPOIB_RX_RING_SIZE; ++i)
+        for (i = 0; i < priv->recvq_size; ++i)
                if (priv->rx_ring[i].skb)
                        ++pending;

@@ -527,7 +527,7 @@
                         */
                        while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
                                tx_req = &priv->tx_ring[priv->tx_tail &
-                                                        (IPOIB_TX_RING_SIZE - 1)];
+                                                        (priv->sendq_size - 1)];
                                dma_unmap_single(priv->ca->dma_device,
                                                 pci_unmap_addr(tx_req, mapping),
                                                 tx_req->skb->len,
@@ -536,7 +536,7 @@
                                ++priv->tx_tail;
                        }

-                        for (i = 0; i < IPOIB_RX_RING_SIZE; ++i)
+                        for (i = 0; i < priv->recvq_size; ++i)
                                if (priv->rx_ring[i].skb) {
                                        dma_unmap_single(priv->ca->dma_device,
                                                         pci_unmap_addr(&priv->rx_ring[i],
diff -urN infiniband/ulp/ipoib/ipoib_main.c infiniband-queue/ulp/ipoib/ipoib_main.c
--- infiniband/ulp/ipoib/ipoib_main.c        2006-03-28 19:20:21.000000000 -0800
+++ infiniband-queue/ulp/ipoib/ipoib_main.c        2006-03-31 09:15:06.345458080 -0800

@@ -53,6 +53,17 @@
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");

+#define IPOIB_MAX_QUEUE_SIZE        4096        /* max is 4k */
+#define IPOIB_MIN_QUEUE_SIZE    64        /* min is 64 */
+
+int ipoib_sendq_size = IPOIB_SENDQ_SIZE;
+int ipoib_recvq_size = IPOIB_RECVQ_SIZE;
+
+module_param_named(sendq_size, ipoib_sendq_size, int, 0444);
+MODULE_PARM_DESC(sendq_size, "Number of wqe in send queue");
+module_param_named(recvq_size, ipoib_recvq_size, int, 0444);
+MODULE_PARM_DESC(recvq_size, "Number of wqe in receive queue");
+
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;

@@ -837,27 +848,61 @@
        return 0;
}

+static int expsize(int size)        /* snap a queue size to a power of two, as the (size - 1) index masks require */
+{        
+        int expsize_t = 1;        /* value returned; stays 1 when size <= 1 */
+        int j = 1;                /* exponent; pre-incremented below, so the first step yields 1 << 2 */
+        while (size / 2 >= expsize_t) {        /* continue while expsize_t is at most half of size */
+                expsize_t = 1 << ++j;
+        }
+        return expsize_t;        /* size >= 4: largest power of two <= size; size 2 or 3: returns 4 */
+}
+
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
        struct ipoib_dev_priv *priv = netdev_priv(dev);

        /* Allocate RX/TX "rings" to hold queued skbs */

-        priv->rx_ring =        kzalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_rx_buf),

+        if (ipoib_recvq_size > IPOIB_MAX_QUEUE_SIZE) {
+                ipoib_recvq_size = IPOIB_MAX_QUEUE_SIZE;
+                 printk(KERN_WARNING "%s: ipoib_recvq_size is too big, use max %d instead\n", ca->name, IPOIB_MAX_QUEUE_SIZE);
+        }
+        if (ipoib_recvq_size < IPOIB_MIN_QUEUE_SIZE) {
+                ipoib_recvq_size = IPOIB_MIN_QUEUE_SIZE;
+                printk(KERN_WARNING "%s: ipoib_recvq_size is too small, use min %d instead\n", ca->name, IPOIB_MIN_QUEUE_SIZE);
+        }
+        priv->recvq_size = expsize(ipoib_recvq_size);
+        priv->rx_ring =        kzalloc(priv->recvq_size * sizeof (struct ipoib_rx_buf),
                                GFP_KERNEL);
        if (!priv->rx_ring) {
                printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
-                       ca->name, IPOIB_RX_RING_SIZE);
+                       ca->name, priv->sendq_size);
                goto out;
        }
+        printk(KERN_INFO "%s: RX_RING_SIZE is set to %d entries\n",
+               ca->name, priv->recvq_size);
+
+        if (ipoib_sendq_size > IPOIB_MAX_QUEUE_SIZE) {
+                ipoib_sendq_size = IPOIB_MAX_QUEUE_SIZE;
+                printk(KERN_WARNING "%s: ipoib_sendq_size is too big, use max %d instead\n", ca->name, IPOIB_MAX_QUEUE_SIZE);

+        }
+        if (ipoib_sendq_size < IPOIB_MIN_QUEUE_SIZE) {
+                ipoib_sendq_size = IPOIB_MIN_QUEUE_SIZE;
+                printk(KERN_WARNING "%s: ipoib_recvq_size is too small, use min %d instead\n", ca->name, IPOIB_MIN_QUEUE_SIZE);
+        }        
+
+        priv->sendq_size = expsize(ipoib_sendq_size);

-        priv->tx_ring = kzalloc(IPOIB_TX_RING_SIZE * sizeof (struct ipoib_tx_buf),
+        priv->tx_ring = kzalloc(priv->sendq_size * sizeof (struct ipoib_tx_buf),
                                GFP_KERNEL);
        if (!priv->tx_ring) {
                printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
-                       ca->name, IPOIB_TX_RING_SIZE);
+                       ca->name, priv->sendq_size);
                goto out_rx_ring_cleanup;
        }
+        printk(KERN_INFO "%s: TX_RING_SIZE is set to %d entries\n",
+               ca->name, priv->sendq_size);

        /* priv->tx_head & tx_tail are already 0 */

@@ -923,7 +968,7 @@
        dev->hard_header_len          = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
        dev->addr_len                  = INFINIBAND_ALEN;
        dev->type                  = ARPHRD_INFINIBAND;
-        dev->tx_queue_len          = IPOIB_TX_RING_SIZE * 2;
+        dev->tx_queue_len          = priv->sendq_size * 2;

         dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;

        /* MTU will be reset when mcast join happens */
diff -urN infiniband/ulp/ipoib/ipoib_verbs.c infiniband-queue/ulp/ipoib/ipoib_verbs.c
--- infiniband/ulp/ipoib/ipoib_verbs.c        2006-03-26 11:57:15.000000000 -0800
+++ infiniband-queue/ulp/ipoib/ipoib_verbs.c        2006-03-31 08:46:34.308727224 -0800
@@ -159,8 +159,8 @@
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ib_qp_init_attr init_attr = {
                .cap = {
-                        .max_send_wr  = IPOIB_TX_RING_SIZE,
-                        .max_recv_wr  = IPOIB_RX_RING_SIZE,
+                        .max_send_wr  = priv->sendq_size,
+                        .max_recv_wr  = priv->recvq_size,
                        .max_send_sge = 1,
                        .max_recv_sge = 1
                },
@@ -175,7 +175,7 @@
        }

        priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev,
-                                IPOIB_TX_RING_SIZE + IPOIB_RX_RING_SIZE + 1);
+                                priv->sendq_size + priv->recvq_size + 1);
        if (IS_ERR(priv->cq)) {
                printk(KERN_WARNING "%s: failed to create CQ\n", ca->name);
                goto out_free_pd;






Thanks
Shirley Ma
IBM Linux Technology Center
15300 SW Koll Parkway
Beaverton, OR 97006-6063
Phone(Fax): (503) 578-7638

Attachment: infiniband-tune-queue.patch
Description: Binary data

_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to