Enhance RX buffer handling in the mana driver by allocating pages from a
page pool and slicing them into MTU-sized fragments, rather than dedicating
a full page per packet. This approach is especially beneficial on systems
with a 64KB page size.

Key improvements:

- Proper integration of page pool for RX buffer allocations.
- MTU-sized buffer slicing to improve memory utilization.
- Reduced overall per-RX-queue memory footprint.
- Automatic fallback to full-page buffers (sizing logic sketched after this
  list) when:
   * Jumbo frames are enabled (MTU + MANA_RXBUF_PAD > PAGE_SIZE / 2).
   * The XDP path is active, to avoid complexities with fragment reuse.
- Removal of redundant pre-allocated RX buffers used in scenarios like MTU
  changes, ensuring consistency in RX buffer allocation.
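
The buffer sizing decision, condensed from the new mana_get_rxbuf_cfg() in
this patch (illustrative pseudocode only; xdp_attached stands in for the
rcu_access_pointer(apc->bpf_prog) check):

        if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || xdp_attached) {
                /* Jumbo frame or XDP: one RX buffer per page. */
                frag_count = 1;
        } else {
                /* Standard MTU: slice the page into aligned fragments. */
                buf_size   = ALIGN(SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD),
                                   MANA_RX_FRAG_ALIGNMENT);
                frag_count = PAGE_SIZE / buf_size;
        }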

Testing on VMs with 64KB pages shows around a 200% throughput improvement.
Memory efficiency also improves significantly, since far less of each page
is wasted: for an MTU of 1500, 35 RX buffers now fit in a single 64KB page,
instead of 1 RX buffer per page previously.
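
As a rough check of that number (assuming MANA_RXBUF_PAD of about 334 bytes,
i.e. skb_shared_info plus the Ethernet header; the exact value is
architecture dependent):

        len        = SKB_DATA_ALIGN(1500 + ~334)                 ~= 1856
        buf_size   = ALIGN(len, MANA_RX_FRAG_ALIGNMENT /* 64 */)  = 1856
        frag_count = PAGE_SIZE / buf_size = 65536 / 1856          = 35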

Tested:

- iperf3, iperf2, and nttcp benchmarks.
- Jumbo frames with MTU 9000.
- Native XDP programs (XDP_PASS, XDP_DROP, XDP_TX, XDP_REDIRECT) for
  testing the driver's XDP path.
- Page leak detection (kmemleak).
- Driver load/unload, reboot, and stress scenarios.

Signed-off-by: Dipayaan Roy <dipayan...@linux.microsoft.com>
---
 .../net/ethernet/microsoft/mana/mana_bpf.c    |  22 +-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 284 ++++++------------
 .../ethernet/microsoft/mana/mana_ethtool.c    |  13 -
 include/net/mana/mana.h                       |  13 +-
 4 files changed, 115 insertions(+), 217 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
index d30721d4516f..96813b6c184f 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
@@ -174,6 +174,7 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog,
        struct mana_port_context *apc = netdev_priv(ndev);
        struct bpf_prog *old_prog;
        struct gdma_context *gc;
+       int err = 0;
 
        gc = apc->ac->gdma_dev->gdma_context;
 
@@ -198,14 +199,33 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog,
        if (old_prog)
                bpf_prog_put(old_prog);
 
-       if (apc->port_is_up)
+       if (apc->port_is_up) {
+               /* Re-create the rxqs after an XDP prog is loaded or unloaded,
+                * e.g. to switch from full pages to smaller page fragments
+                * when the XDP prog is unloaded, and vice versa.
+                */
+
+               err = mana_detach(ndev, false);
+               if (err) {
+                       netdev_err(ndev, "mana_detach failed at xdp set: %d\n", err);
+                       goto out;
+               }
+
+               err = mana_attach(ndev);
+               if (err) {
+                       netdev_err(ndev, "mana_attach failed at xdp set: %d\n", err);
+                       goto out;
+               }
+
                mana_chn_setxdp(apc, prog);
+       }
 
        if (prog)
                ndev->max_mtu = MANA_XDP_MTU_MAX;
        else
                ndev->max_mtu = gc->adapter_mtu - ETH_HLEN;
 
+out:
-       return 0;
+       return err;
 }
 
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a7973651ae51..a474c59c907c 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -548,171 +548,45 @@ static u16 mana_select_queue(struct net_device *ndev, struct sk_buff *skb,
        return txq;
 }
 
-/* Release pre-allocated RX buffers */
-void mana_pre_dealloc_rxbufs(struct mana_port_context *mpc)
-{
-       struct device *dev;
-       int i;
-
-       dev = mpc->ac->gdma_dev->gdma_context->dev;
-
-       if (!mpc->rxbufs_pre)
-               goto out1;
-
-       if (!mpc->das_pre)
-               goto out2;
-
-       while (mpc->rxbpre_total) {
-               i = --mpc->rxbpre_total;
-               dma_unmap_single(dev, mpc->das_pre[i], mpc->rxbpre_datasize,
-                                DMA_FROM_DEVICE);
-               put_page(virt_to_head_page(mpc->rxbufs_pre[i]));
-       }
-
-       kfree(mpc->das_pre);
-       mpc->das_pre = NULL;
-
-out2:
-       kfree(mpc->rxbufs_pre);
-       mpc->rxbufs_pre = NULL;
-
-out1:
-       mpc->rxbpre_datasize = 0;
-       mpc->rxbpre_alloc_size = 0;
-       mpc->rxbpre_headroom = 0;
-}
-
-/* Get a buffer from the pre-allocated RX buffers */
-static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
-{
-       struct net_device *ndev = rxq->ndev;
-       struct mana_port_context *mpc;
-       void *va;
-
-       mpc = netdev_priv(ndev);
-
-       if (!mpc->rxbufs_pre || !mpc->das_pre || !mpc->rxbpre_total) {
-               netdev_err(ndev, "No RX pre-allocated bufs\n");
-               return NULL;
-       }
-
-       /* Check sizes to catch unexpected coding error */
-       if (mpc->rxbpre_datasize != rxq->datasize) {
-               netdev_err(ndev, "rxbpre_datasize mismatch: %u: %u\n",
-                          mpc->rxbpre_datasize, rxq->datasize);
-               return NULL;
-       }
-
-       if (mpc->rxbpre_alloc_size != rxq->alloc_size) {
-               netdev_err(ndev, "rxbpre_alloc_size mismatch: %u: %u\n",
-                          mpc->rxbpre_alloc_size, rxq->alloc_size);
-               return NULL;
-       }
-
-       if (mpc->rxbpre_headroom != rxq->headroom) {
-               netdev_err(ndev, "rxbpre_headroom mismatch: %u: %u\n",
-                          mpc->rxbpre_headroom, rxq->headroom);
-               return NULL;
-       }
-
-       mpc->rxbpre_total--;
-
-       *da = mpc->das_pre[mpc->rxbpre_total];
-       va = mpc->rxbufs_pre[mpc->rxbpre_total];
-       mpc->rxbufs_pre[mpc->rxbpre_total] = NULL;
-
-       /* Deallocate the array after all buffers are gone */
-       if (!mpc->rxbpre_total)
-               mana_pre_dealloc_rxbufs(mpc);
-
-       return va;
-}
-
 /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
-static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size,
-                              u32 *headroom)
+static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
+                              int mtu, u32 *datasize, u32 *alloc_size,
+                              u32 *headroom, u32 *frag_count)
 {
-       if (mtu > MANA_XDP_MTU_MAX)
-               *headroom = 0; /* no support for XDP */
-       else
-               *headroom = XDP_PACKET_HEADROOM;
-
-       *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
-
-       /* Using page pool in this case, so alloc_size is PAGE_SIZE */
-       if (*alloc_size < PAGE_SIZE)
-               *alloc_size = PAGE_SIZE;
-
+       /* Calculate datasize first (consistent across all cases) */
        *datasize = mtu + ETH_HLEN;
-}
-
-int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_queues)
-{
-       struct device *dev;
-       struct page *page;
-       dma_addr_t da;
-       int num_rxb;
-       void *va;
-       int i;
-
-       mana_get_rxbuf_cfg(new_mtu, &mpc->rxbpre_datasize,
-                          &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom);
-
-       dev = mpc->ac->gdma_dev->gdma_context->dev;
-
-       num_rxb = num_queues * mpc->rx_queue_size;
-
-       WARN(mpc->rxbufs_pre, "mana rxbufs_pre exists\n");
-       mpc->rxbufs_pre = kmalloc_array(num_rxb, sizeof(void *), GFP_KERNEL);
-       if (!mpc->rxbufs_pre)
-               goto error;
 
-       mpc->das_pre = kmalloc_array(num_rxb, sizeof(dma_addr_t), GFP_KERNEL);
-       if (!mpc->das_pre)
-               goto error;
-
-       mpc->rxbpre_total = 0;
-
-       for (i = 0; i < num_rxb; i++) {
-               page = dev_alloc_pages(get_order(mpc->rxbpre_alloc_size));
-               if (!page)
-                       goto error;
-
-               va = page_to_virt(page);
-
-               da = dma_map_single(dev, va + mpc->rxbpre_headroom,
-                                   mpc->rxbpre_datasize, DMA_FROM_DEVICE);
-               if (dma_mapping_error(dev, da)) {
-                       put_page(page);
-                       goto error;
+       /* For xdp and jumbo frames make sure only one packet fits per page */
+       if (((mtu + MANA_RXBUF_PAD) > PAGE_SIZE / 2) || rcu_access_pointer(apc->bpf_prog)) {
+               if (rcu_access_pointer(apc->bpf_prog)) {
+                       *headroom = XDP_PACKET_HEADROOM;
+                       *alloc_size = PAGE_SIZE;
+               } else {
+                       *headroom = 0; /* no support for XDP */
+                       *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
                }
 
-               mpc->rxbufs_pre[i] = va;
-               mpc->das_pre[i] = da;
-               mpc->rxbpre_total = i + 1;
+               *frag_count = 1;
+               return;
        }
 
-       return 0;
+       /* Standard MTU case - optimize for multiple packets per page */
+       *headroom = 0;
 
-error:
-       netdev_err(mpc->ndev, "Failed to pre-allocate RX buffers for %d queues\n", num_queues);
-       mana_pre_dealloc_rxbufs(mpc);
-       return -ENOMEM;
+       /* Calculate base buffer size needed */
+       u32 len = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
+       u32 buf_size = ALIGN(len, MANA_RX_FRAG_ALIGNMENT);
+
+       /* Calculate how many packets can fit in a page */
+       *frag_count = PAGE_SIZE / buf_size;
+       *alloc_size = buf_size;
 }
 
 static int mana_change_mtu(struct net_device *ndev, int new_mtu)
 {
-       struct mana_port_context *mpc = netdev_priv(ndev);
        unsigned int old_mtu = ndev->mtu;
        int err;
 
-       /* Pre-allocate buffers to prevent failure in mana_attach later */
-       err = mana_pre_alloc_rxbufs(mpc, new_mtu, mpc->num_queues);
-       if (err) {
-               netdev_err(ndev, "Insufficient memory for new MTU\n");
-               return err;
-       }
-
        err = mana_detach(ndev, false);
        if (err) {
                netdev_err(ndev, "mana_detach failed: %d\n", err);
@@ -728,7 +602,6 @@ static int mana_change_mtu(struct net_device *ndev, int new_mtu)
        }
 
 out:
-       mana_pre_dealloc_rxbufs(mpc);
        return err;
 }
 
@@ -1841,8 +1714,11 @@ static void mana_rx_skb(void *buf_va, bool from_pool,
 
 drop:
        if (from_pool) {
-               page_pool_recycle_direct(rxq->page_pool,
-                                        virt_to_head_page(buf_va));
+               if (rxq->frag_count == 1)
+                       page_pool_recycle_direct(rxq->page_pool,
+                                                virt_to_head_page(buf_va));
+               else
+                       page_pool_free_va(rxq->page_pool, buf_va, true);
        } else {
                WARN_ON_ONCE(rxq->xdp_save_va);
                /* Save for reuse */
@@ -1854,37 +1730,50 @@ static void mana_rx_skb(void *buf_va, bool from_pool,
        return;
 }
 
-static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
-                            dma_addr_t *da, bool *from_pool)
+static void *mana_get_rxfrag(struct mana_rxq *rxq,
+                            struct device *dev, dma_addr_t *da, bool *from_pool)
 {
        struct page *page;
+       u32 offset;
        void *va;
-
        *from_pool = false;
 
-       /* Reuse XDP dropped page if available */
-       if (rxq->xdp_save_va) {
-               va = rxq->xdp_save_va;
-               rxq->xdp_save_va = NULL;
-       } else {
-               page = page_pool_dev_alloc_pages(rxq->page_pool);
-               if (!page)
+       /* Don't use fragments for jumbo frames or XDP (i.e. when frag_count == 1) */
+       if (rxq->frag_count == 1) {
+               /* Reuse XDP dropped page if available */
+               if (rxq->xdp_save_va) {
+                       va = rxq->xdp_save_va;
+                       rxq->xdp_save_va = NULL;
+               } else {
+                       page = page_pool_dev_alloc_pages(rxq->page_pool);
+                       if (!page)
+                               return NULL;
+
+                       *from_pool = true;
+                       va = page_to_virt(page);
+               }
+
+               *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
+                                    DMA_FROM_DEVICE);
+               if (dma_mapping_error(dev, *da)) {
+                       if (*from_pool)
+                               page_pool_put_full_page(rxq->page_pool, page, false);
+                       else
+                               put_page(virt_to_head_page(va));
+
                        return NULL;
+               }
 
-               *from_pool = true;
-               va = page_to_virt(page);
+               return va;
        }
 
-       *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
-                            DMA_FROM_DEVICE);
-       if (dma_mapping_error(dev, *da)) {
-               if (*from_pool)
-                       page_pool_put_full_page(rxq->page_pool, page, false);
-               else
-                       put_page(virt_to_head_page(va));
-
+       page = page_pool_dev_alloc_frag(rxq->page_pool, &offset, rxq->alloc_size);
+       if (!page)
                return NULL;
-       }
+
+       va  = page_to_virt(page) + offset;
+       *da = page_pool_get_dma_addr(page) + offset + rxq->headroom;
+       *from_pool = true;
 
        return va;
 }
@@ -1901,9 +1790,9 @@ static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq,
        va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
        if (!va)
                return;
-
-       dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
-                        DMA_FROM_DEVICE);
+       if (!rxoob->from_pool || rxq->frag_count == 1)
+               dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
+                                DMA_FROM_DEVICE);
        *old_buf = rxoob->buf_va;
        *old_fp = rxoob->from_pool;
 
@@ -2314,15 +2203,19 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
                if (!rx_oob->buf_va)
                        continue;
 
-               dma_unmap_single(dev, rx_oob->sgl[0].address,
-                                rx_oob->sgl[0].size, DMA_FROM_DEVICE);
-
                page = virt_to_head_page(rx_oob->buf_va);
 
-               if (rx_oob->from_pool)
-                       page_pool_put_full_page(rxq->page_pool, page, false);
-               else
-                       put_page(page);
+               if (rxq->frag_count == 1) {
+                       dma_unmap_single(dev, rx_oob->sgl[0].address, rx_oob->sgl[0].size,
+                                        DMA_FROM_DEVICE);
+
+                       if (rx_oob->from_pool)
+                               page_pool_put_full_page(rxq->page_pool, page, false);
+                       else
+                               put_page(page);
+               } else {
+                       page_pool_free_va(rxq->page_pool, rx_oob->buf_va, true);
+               }
 
                rx_oob->buf_va = NULL;
        }
@@ -2338,16 +2231,11 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
                            struct mana_rxq *rxq, struct device *dev)
 {
-       struct mana_port_context *mpc = netdev_priv(rxq->ndev);
        bool from_pool = false;
        dma_addr_t da;
        void *va;
 
-       if (mpc->rxbufs_pre)
-               va = mana_get_rxbuf_pre(rxq, &da);
-       else
-               va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
-
+       va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
        if (!va)
                return -ENOMEM;
 
@@ -2428,11 +2316,22 @@ static int mana_create_page_pool(struct mana_rxq *rxq, struct gdma_context *gc)
        struct page_pool_params pprm = {};
        int ret;
 
-       pprm.pool_size = mpc->rx_queue_size;
+       pprm.pool_size = mpc->rx_queue_size / rxq->frag_count + 1;
        pprm.nid = gc->numa_node;
        pprm.napi = &rxq->rx_cq.napi;
        pprm.netdev = rxq->ndev;
        pprm.order = get_order(rxq->alloc_size);
+       pprm.queue_idx = rxq->rxq_idx;
+       pprm.dev = gc->dev;
+
+       /* Let the page pool do the DMA mapping when page sharing with
+        * multiple fragments is enabled for RX buffers.
+        */
+       if (rxq->frag_count > 1) {
+               pprm.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
+               pprm.max_len = PAGE_SIZE;
+               pprm.dma_dir = DMA_FROM_DEVICE;
+       }
 
        rxq->page_pool = page_pool_create(&pprm);
 
@@ -2471,9 +2370,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
        rxq->rxq_idx = rxq_idx;
        rxq->rxobj = INVALID_MANA_HANDLE;
 
-       mana_get_rxbuf_cfg(ndev->mtu, &rxq->datasize, &rxq->alloc_size,
-                          &rxq->headroom);
-
+       mana_get_rxbuf_cfg(apc, ndev->mtu, &rxq->datasize, &rxq->alloc_size,
+                          &rxq->headroom, &rxq->frag_count);
        /* Create page pool for RX queue */
        err = mana_create_page_pool(rxq, gc);
        if (err) {
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index a1afa75a9463..7ede03c74fb9 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -396,12 +396,6 @@ static int mana_set_channels(struct net_device *ndev,
        unsigned int old_count = apc->num_queues;
        int err;
 
-       err = mana_pre_alloc_rxbufs(apc, ndev->mtu, new_count);
-       if (err) {
-               netdev_err(ndev, "Insufficient memory for new allocations");
-               return err;
-       }
-
        err = mana_detach(ndev, false);
        if (err) {
                netdev_err(ndev, "mana_detach failed: %d\n", err);
@@ -416,7 +410,6 @@ static int mana_set_channels(struct net_device *ndev,
        }
 
 out:
-       mana_pre_dealloc_rxbufs(apc);
        return err;
 }
 
@@ -465,12 +458,7 @@ static int mana_set_ringparam(struct net_device *ndev,
 
        /* pre-allocating new buffers to prevent failures in mana_attach() later */
        apc->rx_queue_size = new_rx;
-       err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
        apc->rx_queue_size = old_rx;
-       if (err) {
-               netdev_err(ndev, "Insufficient memory for new allocations\n");
-               return err;
-       }
 
        err = mana_detach(ndev, false);
        if (err) {
@@ -488,7 +476,6 @@ static int mana_set_ringparam(struct net_device *ndev,
                apc->rx_queue_size = old_rx;
        }
 out:
-       mana_pre_dealloc_rxbufs(apc);
        return err;
 }
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index e1030a7d2daa..99a3847b0f9d 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -65,6 +65,8 @@ enum TRI_STATE {
 #define MANA_STATS_RX_COUNT 5
 #define MANA_STATS_TX_COUNT 11
 
+#define MANA_RX_FRAG_ALIGNMENT 64
+
 struct mana_stats_rx {
        u64 packets;
        u64 bytes;
@@ -328,6 +330,7 @@ struct mana_rxq {
        u32 datasize;
        u32 alloc_size;
        u32 headroom;
+       u32 frag_count;
 
        mana_handle_t rxobj;
 
@@ -503,14 +506,6 @@ struct mana_port_context {
        /* This points to an array of num_queues of RQ pointers. */
        struct mana_rxq **rxqs;
 
-       /* pre-allocated rx buffer array */
-       void **rxbufs_pre;
-       dma_addr_t *das_pre;
-       int rxbpre_total;
-       u32 rxbpre_datasize;
-       u32 rxbpre_alloc_size;
-       u32 rxbpre_headroom;
-
        struct bpf_prog *bpf_prog;
 
        /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
@@ -574,8 +569,6 @@ int mana_query_link_cfg(struct mana_port_context *apc);
 int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
                      int enable_clamping);
 void mana_query_phy_stats(struct mana_port_context *apc);
-int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues);
-void mana_pre_dealloc_rxbufs(struct mana_port_context *apc);
 
 extern const struct ethtool_ops mana_ethtool_ops;
 extern struct dentry *mana_debugfs_root;
-- 
2.43.0

