Thanks Saurabh, I am sending a v2; replies to the individual comments are inline below.

Regards,
Dipayaan Roy
On Wed, Jul 23, 2025 at 12:38:04AM -0700, Saurabh Singh Sengar wrote:
> On Mon, Jul 21, 2025 at 03:14:17AM -0700, Dipayaan Roy wrote:
> > This patch enhances RX buffer handling in the mana driver by allocating
> > pages from a page pool and slicing them into MTU-sized fragments, rather
> > than dedicating a full page per packet. This approach is especially
> > beneficial on systems with 64KB page sizes.
> >
> > Key improvements:
> >
> > - Proper integration of page pool for RX buffer allocations.
> > - MTU-sized buffer slicing to improve memory utilization.
> > - Reduced overall per-Rx-queue memory footprint.
> > - Automatic fallback to full-page buffers when:
> >   * Jumbo frames are enabled (MTU > PAGE_SIZE / 2).
> >   * The XDP path is active, to avoid complexities with fragment reuse.
> > - Removal of redundant pre-allocated RX buffers used in scenarios like MTU
> >   changes, ensuring consistency in RX buffer allocation.
> >
> > Testing on VMs with 64KB pages shows around 200% throughput improvement.
> > Memory efficiency is significantly improved due to reduced wastage in page
> > allocations. Example: We are now able to fit 35 Rx buffers in a single 64KB
> > page for an MTU size of 1500, instead of 1 Rx buffer per page previously.
> >
> > Tested:
> >
> > - iperf3, iperf2, and nttcp benchmarks.
> > - Jumbo frames with MTU 9000.
> > - Native XDP programs (XDP_PASS, XDP_DROP, XDP_TX, XDP_REDIRECT) for
> >   testing the driver's XDP path.
> > - Page leak detection (kmemleak).
> > - Driver load/unload, reboot, and stress scenarios.
> >
> > Signed-off-by: Dipayaan Roy <dipayan...@linux.microsoft.com>
> > ---
> >  .../net/ethernet/microsoft/mana/mana_bpf.c    |  22 +-
> >  drivers/net/ethernet/microsoft/mana/mana_en.c | 284 ++++++------------
> >  .../ethernet/microsoft/mana/mana_ethtool.c    |  13 -
> >  include/net/mana/mana.h                       |  13 +-
> >  4 files changed, 115 insertions(+), 217 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
> > index d30721d4516f..96813b6c184f 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
> > @@ -174,6 +174,7 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog,
> >  	struct mana_port_context *apc = netdev_priv(ndev);
> >  	struct bpf_prog *old_prog;
> >  	struct gdma_context *gc;
> > +	int err;
> >
> >  	gc = apc->ac->gdma_dev->gdma_context;
> >
> > @@ -198,14 +199,33 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog,
> >  	if (old_prog)
> >  		bpf_prog_put(old_prog);
> >
> > -	if (apc->port_is_up)
> > +	if (apc->port_is_up) {
> > +		/* Re-create rxq's after xdp prog was loaded or unloaded.
> > +		 * Ex: re-create rxq's to switch from full pages to smaller
> > +		 * size page fragments when xdp prog is unloaded and vice-versa.
> > +		 */
> > +
> > +		err = mana_detach(ndev, false);
> > +		if (err) {
> > +			netdev_err(ndev, "mana_detach failed at xdp set: %d\n", err);
> > +			goto out;
> 
> You should return err. At "out" we are always returning 0, which is wrong.
> 
> > +		}
> > +
> > +		err = mana_attach(ndev);
> > +		if (err) {
> > +			netdev_err(ndev, "mana_attach failed at xdp set: %d\n", err);
> > +			goto out;
> 
> Same here.
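Agreed on both, will fix in v2 so that the error actually gets
propagated. A minimal sketch of how I intend the tail of mana_xdp_set()
to look (err starts at 0 so the success path still returns 0; the prog
checks and old_prog swap above this point stay as in this patch):

	int err = 0;

	/* ... prog checks and old_prog swap unchanged ... */

	if (apc->port_is_up) {
		/* Re-create rxq's after xdp prog was loaded or unloaded */
		err = mana_detach(ndev, false);
		if (err) {
			netdev_err(ndev, "mana_detach failed at xdp set: %d\n", err);
			goto out;
		}

		err = mana_attach(ndev);
		if (err) {
			netdev_err(ndev, "mana_attach failed at xdp set: %d\n", err);
			goto out;
		}

		mana_chn_setxdp(apc, prog);
	}

	if (prog)
		ndev->max_mtu = MANA_XDP_MTU_MAX;
	else
		ndev->max_mtu = gc->adapter_mtu - ETH_HLEN;

out:
	return err;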
> 
> > +		}
> > +
> > +		mana_chn_setxdp(apc, prog);
> > +	}
> >
> >  	if (prog)
> >  		ndev->max_mtu = MANA_XDP_MTU_MAX;
> >  	else
> >  		ndev->max_mtu = gc->adapter_mtu - ETH_HLEN;
> >
> > +out:
> >  	return 0;
> >  }
> >
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index a7973651ae51..a474c59c907c 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -548,171 +548,45 @@ static u16 mana_select_queue(struct net_device *ndev, struct sk_buff *skb,
> >  	return txq;
> >  }
> >
> > -/* Release pre-allocated RX buffers */
> > -void mana_pre_dealloc_rxbufs(struct mana_port_context *mpc)
> > -{
> > -	struct device *dev;
> > -	int i;
> > -
> > -	dev = mpc->ac->gdma_dev->gdma_context->dev;
> > -
> > -	if (!mpc->rxbufs_pre)
> > -		goto out1;
> > -
> > -	if (!mpc->das_pre)
> > -		goto out2;
> > -
> > -	while (mpc->rxbpre_total) {
> > -		i = --mpc->rxbpre_total;
> > -		dma_unmap_single(dev, mpc->das_pre[i], mpc->rxbpre_datasize,
> > -				 DMA_FROM_DEVICE);
> > -		put_page(virt_to_head_page(mpc->rxbufs_pre[i]));
> > -	}
> > -
> > -	kfree(mpc->das_pre);
> > -	mpc->das_pre = NULL;
> > -
> > -out2:
> > -	kfree(mpc->rxbufs_pre);
> > -	mpc->rxbufs_pre = NULL;
> > -
> > -out1:
> > -	mpc->rxbpre_datasize = 0;
> > -	mpc->rxbpre_alloc_size = 0;
> > -	mpc->rxbpre_headroom = 0;
> > -}
> > -
> > -/* Get a buffer from the pre-allocated RX buffers */
> > -static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
> > -{
> > -	struct net_device *ndev = rxq->ndev;
> > -	struct mana_port_context *mpc;
> > -	void *va;
> > -
> > -	mpc = netdev_priv(ndev);
> > -
> > -	if (!mpc->rxbufs_pre || !mpc->das_pre || !mpc->rxbpre_total) {
> > -		netdev_err(ndev, "No RX pre-allocated bufs\n");
> > -		return NULL;
> > -	}
> > -
> > -	/* Check sizes to catch unexpected coding error */
> > -	if (mpc->rxbpre_datasize != rxq->datasize) {
> > -		netdev_err(ndev, "rxbpre_datasize mismatch: %u: %u\n",
> > -			   mpc->rxbpre_datasize, rxq->datasize);
> > -		return NULL;
> > -	}
> > -
> > -	if (mpc->rxbpre_alloc_size != rxq->alloc_size) {
> > -		netdev_err(ndev, "rxbpre_alloc_size mismatch: %u: %u\n",
> > -			   mpc->rxbpre_alloc_size, rxq->alloc_size);
> > -		return NULL;
> > -	}
> > -
> > -	if (mpc->rxbpre_headroom != rxq->headroom) {
> > -		netdev_err(ndev, "rxbpre_headroom mismatch: %u: %u\n",
> > -			   mpc->rxbpre_headroom, rxq->headroom);
> > -		return NULL;
> > -	}
> > -
> > -	mpc->rxbpre_total--;
> > -
> > -	*da = mpc->das_pre[mpc->rxbpre_total];
> > -	va = mpc->rxbufs_pre[mpc->rxbpre_total];
> > -	mpc->rxbufs_pre[mpc->rxbpre_total] = NULL;
> > -
> > -	/* Deallocate the array after all buffers are gone */
> > -	if (!mpc->rxbpre_total)
> > -		mana_pre_dealloc_rxbufs(mpc);
> > -
> > -	return va;
> > -}
> > -
> >  /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
> > -static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size,
> > -			       u32 *headroom)
> > +static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
> > +			       int mtu, u32 *datasize, u32 *alloc_size,
> > +			       u32 *headroom, u32 *frag_count)
> >  {
> > -	if (mtu > MANA_XDP_MTU_MAX)
> > -		*headroom = 0; /* no support for XDP */
> > -	else
> > -		*headroom = XDP_PACKET_HEADROOM;
> > -
> > -	*alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
> > -
> > -	/* Using page pool in this case, so alloc_size is PAGE_SIZE */
> > -	if (*alloc_size < PAGE_SIZE)
> > -		*alloc_size = PAGE_SIZE;
> > -
> > +	/* Calculate datasize first (consistent across all cases) */
> >  	*datasize = mtu + ETH_HLEN;
> > -}
> > -
> > -int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_queues)
> > -{
> > -	struct device *dev;
> > -	struct page *page;
> > -	dma_addr_t da;
> > -	int num_rxb;
> > -	void *va;
> > -	int i;
> > -
> > -	mana_get_rxbuf_cfg(new_mtu, &mpc->rxbpre_datasize,
> > -			   &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom);
> > -
> > -	dev = mpc->ac->gdma_dev->gdma_context->dev;
> > -
> > -	num_rxb = num_queues * mpc->rx_queue_size;
> > -
> > -	WARN(mpc->rxbufs_pre, "mana rxbufs_pre exists\n");
> > -	mpc->rxbufs_pre = kmalloc_array(num_rxb, sizeof(void *), GFP_KERNEL);
> > -	if (!mpc->rxbufs_pre)
> > -		goto error;
> >
> > -	mpc->das_pre = kmalloc_array(num_rxb, sizeof(dma_addr_t), GFP_KERNEL);
> > -	if (!mpc->das_pre)
> > -		goto error;
> > -
> > -	mpc->rxbpre_total = 0;
> > -
> > -	for (i = 0; i < num_rxb; i++) {
> > -		page = dev_alloc_pages(get_order(mpc->rxbpre_alloc_size));
> > -		if (!page)
> > -			goto error;
> > -
> > -		va = page_to_virt(page);
> > -
> > -		da = dma_map_single(dev, va + mpc->rxbpre_headroom,
> > -				    mpc->rxbpre_datasize, DMA_FROM_DEVICE);
> > -		if (dma_mapping_error(dev, da)) {
> > -			put_page(page);
> > -			goto error;
> > +	/* For xdp and jumbo frames make sure only one packet fits per page */
> > +	if (((mtu + MANA_RXBUF_PAD) > PAGE_SIZE / 2) || rcu_access_pointer(apc->bpf_prog)) {
> > +		if (rcu_access_pointer(apc->bpf_prog)) {
> > +			*headroom = XDP_PACKET_HEADROOM;
> > +			*alloc_size = PAGE_SIZE;
> > +		} else {
> > +			*headroom = 0; /* no support for XDP */
> > +			*alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
> >  		}
> >
> > -		mpc->rxbufs_pre[i] = va;
> > -		mpc->das_pre[i] = da;
> > -		mpc->rxbpre_total = i + 1;
> > +		*frag_count = 1;
> > +		return;
> >  	}
> >
> > -	return 0;
> > +	/* Standard MTU case - optimize for multiple packets per page */
> > +	*headroom = 0;
> >
> > -error:
> > -	netdev_err(mpc->ndev, "Failed to pre-allocate RX buffers for %d queues\n", num_queues);
> > -	mana_pre_dealloc_rxbufs(mpc);
> > -	return -ENOMEM;
> > +	/* Calculate base buffer size needed */
> > +	u32 len = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
> > +	u32 buf_size = ALIGN(len, MANA_RX_FRAG_ALIGNMENT);
> 
> It's good to have all the declarations at the start of the function.
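Ack, will hoist them in v2. Roughly how I expect mana_get_rxbuf_cfg()
to look with the declarations moved to the top (same logic as in this
patch, only the declarations move; a sketch, not the final v2 code):

	static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
				       int mtu, u32 *datasize, u32 *alloc_size,
				       u32 *headroom, u32 *frag_count)
	{
		u32 buf_size;
		u32 len;

		/* Calculate datasize first (consistent across all cases) */
		*datasize = mtu + ETH_HLEN;

		/* ... XDP/jumbo single-fragment cases as above ... */

		/* Standard MTU case - optimize for multiple packets per page */
		*headroom = 0;

		/* Calculate base buffer size needed */
		len = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
		buf_size = ALIGN(len, MANA_RX_FRAG_ALIGNMENT);

		/* Calculate how many packets can fit in a page */
		*frag_count = PAGE_SIZE / buf_size;
		*alloc_size = buf_size;
	}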
> 
> > +
> > +	/* Calculate how many packets can fit in a page */
> > +	*frag_count = PAGE_SIZE / buf_size;
> > +	*alloc_size = buf_size;
> >  }
> >
> >  static int mana_change_mtu(struct net_device *ndev, int new_mtu)
> >  {
> > -	struct mana_port_context *mpc = netdev_priv(ndev);
> >  	unsigned int old_mtu = ndev->mtu;
> >  	int err;
> >
> > -	/* Pre-allocate buffers to prevent failure in mana_attach later */
> > -	err = mana_pre_alloc_rxbufs(mpc, new_mtu, mpc->num_queues);
> > -	if (err) {
> > -		netdev_err(ndev, "Insufficient memory for new MTU\n");
> > -		return err;
> > -	}
> > -
> >  	err = mana_detach(ndev, false);
> >  	if (err) {
> >  		netdev_err(ndev, "mana_detach failed: %d\n", err);
> > @@ -728,7 +602,6 @@ static int mana_change_mtu(struct net_device *ndev, int new_mtu)
> >  	}
> >
> >  out:
> > -	mana_pre_dealloc_rxbufs(mpc);
> >  	return err;
> >  }
> >
> > @@ -1841,8 +1714,11 @@ static void mana_rx_skb(void *buf_va, bool from_pool,
> >
> >  drop:
> >  	if (from_pool) {
> > -		page_pool_recycle_direct(rxq->page_pool,
> > -					 virt_to_head_page(buf_va));
> > +		if (rxq->frag_count == 1)
> > +			page_pool_recycle_direct(rxq->page_pool,
> > +						 virt_to_head_page(buf_va));
> > +		else
> > +			page_pool_free_va(rxq->page_pool, buf_va, true);
> >  	} else {
> >  		WARN_ON_ONCE(rxq->xdp_save_va);
> >  		/* Save for reuse */
> > @@ -1854,37 +1730,50 @@ static void mana_rx_skb(void *buf_va, bool from_pool,
> >  	return;
> >  }
> > -static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
> > -			     dma_addr_t *da, bool *from_pool)
> > +static void *mana_get_rxfrag(struct mana_rxq *rxq,
> > +			     struct device *dev, dma_addr_t *da, bool *from_pool)
> >  {
> >  	struct page *page;
> > +	u32 offset;
> >  	void *va;
> > -
> >  	*from_pool = false;
> >
> > -	/* Reuse XDP dropped page if available */
> > -	if (rxq->xdp_save_va) {
> > -		va = rxq->xdp_save_va;
> > -		rxq->xdp_save_va = NULL;
> > -	} else {
> > -		page = page_pool_dev_alloc_pages(rxq->page_pool);
> > -		if (!page)
> > +	/* Don't use fragments for jumbo frames or XDP (i.e. when fragment = 1 per page) */
> > +	if (rxq->frag_count == 1) {
> > +		/* Reuse XDP dropped page if available */
> > +		if (rxq->xdp_save_va) {
> > +			va = rxq->xdp_save_va;
> > +			rxq->xdp_save_va = NULL;
> > +		} else {
> > +			page = page_pool_dev_alloc_pages(rxq->page_pool);
> > +			if (!page)
> > +				return NULL;
> > +
> > +			*from_pool = true;
> > +			va = page_to_virt(page);
> > +		}
> > +
> > +		*da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
> > +				     DMA_FROM_DEVICE);
> > +		if (dma_mapping_error(dev, *da)) {
> > +			if (*from_pool)
> > +				page_pool_put_full_page(rxq->page_pool, page, false);
> > +			else
> > +				put_page(virt_to_head_page(va));
> > +
> >  			return NULL;
> > +		}
> >
> > -		*from_pool = true;
> > -		va = page_to_virt(page);
> > +		return va;
> >  	}
> >
> > -	*da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
> > -			     DMA_FROM_DEVICE);
> > -	if (dma_mapping_error(dev, *da)) {
> > -		if (*from_pool)
> > -			page_pool_put_full_page(rxq->page_pool, page, false);
> > -		else
> > -			put_page(virt_to_head_page(va));
> > -
> > +	page = page_pool_dev_alloc_frag(rxq->page_pool, &offset, rxq->alloc_size);
> > +	if (!page)
> >  		return NULL;
> > -	}
> > +
> > +	va = page_to_virt(page) + offset;
> > +	*da = page_pool_get_dma_addr(page) + offset + rxq->headroom;
> > +	*from_pool = true;
> >
> >  	return va;
> >  }
> > @@ -1901,9 +1790,9 @@ static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq,
> >  	va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
> >  	if (!va)
> >  		return;
> > -
> > -	dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
> > -			 DMA_FROM_DEVICE);
> > +	if (!rxoob->from_pool || rxq->frag_count == 1)
> > +		dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
> > +				 DMA_FROM_DEVICE);
> >  	*old_buf = rxoob->buf_va;
> >  	*old_fp = rxoob->from_pool;
> >
> > @@ -2314,15 +2203,19 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
> >  		if (!rx_oob->buf_va)
> >  			continue;
> >
> > -		dma_unmap_single(dev, rx_oob->sgl[0].address,
> > -				 rx_oob->sgl[0].size, DMA_FROM_DEVICE);
> > -
> >  		page = virt_to_head_page(rx_oob->buf_va);
> >
> > -		if (rx_oob->from_pool)
> > -			page_pool_put_full_page(rxq->page_pool, page, false);
> > -		else
> > -			put_page(page);
> > +		if (rxq->frag_count == 1) {
> > +			dma_unmap_single(dev, rx_oob->sgl[0].address, rx_oob->sgl[0].size,
> > +					 DMA_FROM_DEVICE);
> > +
> > +			if (rx_oob->from_pool)
> > +				page_pool_put_full_page(rxq->page_pool, page, false);
> > +			else
> > +				put_page(page);
> > +		} else {
> > +			page_pool_free_va(rxq->page_pool, rx_oob->buf_va, true);
> > +		}
> >
> >  		rx_oob->buf_va = NULL;
> >  	}
> > @@ -2338,16 +2231,11 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
> >  static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
> >  			    struct mana_rxq *rxq, struct device *dev)
> >  {
> > -	struct mana_port_context *mpc = netdev_priv(rxq->ndev);
> >  	bool from_pool = false;
> >  	dma_addr_t da;
> >  	void *va;
> >
> > -	if (mpc->rxbufs_pre)
> > -		va = mana_get_rxbuf_pre(rxq, &da);
> > -	else
> > -		va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
> > -
> > +	va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
> >  	if (!va)
> >  		return -ENOMEM;
> >
> > @@ -2428,11 +2316,22 @@ static int mana_create_page_pool(struct mana_rxq *rxq, struct gdma_context *gc)
> >  	struct page_pool_params pprm = {};
> >  	int ret;
> >
> > -	pprm.pool_size = mpc->rx_queue_size;
> > +	pprm.pool_size = mpc->rx_queue_size / rxq->frag_count + 1;
> >  	pprm.nid = gc->numa_node;
> >  	pprm.napi = &rxq->rx_cq.napi;
> >  	pprm.netdev = rxq->ndev;
> >  	pprm.order = get_order(rxq->alloc_size);
> > +	pprm.queue_idx = rxq->rxq_idx;
> > +	pprm.dev = gc->dev;
> > +
> > +	/* Let the page pool do the dma map when page sharing with multiple
> > +	 * fragments is enabled for rx buffers.
> > +	 */
> > +	if (rxq->frag_count > 1) {
> > +		pprm.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> > +		pprm.max_len = PAGE_SIZE;
> > +		pprm.dma_dir = DMA_FROM_DEVICE;
> > +	}
> >
> >  	rxq->page_pool = page_pool_create(&pprm);
> >
> > @@ -2471,9 +2370,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
> >  	rxq->rxq_idx = rxq_idx;
> >  	rxq->rxobj = INVALID_MANA_HANDLE;
> >
> > -	mana_get_rxbuf_cfg(ndev->mtu, &rxq->datasize, &rxq->alloc_size,
> > -			   &rxq->headroom);
> > -
> > +	mana_get_rxbuf_cfg(apc, ndev->mtu, &rxq->datasize, &rxq->alloc_size,
> > +			   &rxq->headroom, &rxq->frag_count);
> >  	/* Create page pool for RX queue */
> >  	err = mana_create_page_pool(rxq, gc);
> >  	if (err) {
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > index a1afa75a9463..7ede03c74fb9 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > @@ -396,12 +396,6 @@ static int mana_set_channels(struct net_device *ndev,
> >  	unsigned int old_count = apc->num_queues;
> >  	int err;
> >
> > -	err = mana_pre_alloc_rxbufs(apc, ndev->mtu, new_count);
> > -	if (err) {
> > -		netdev_err(ndev, "Insufficient memory for new allocations");
> > -		return err;
> > -	}
> > -
> >  	err = mana_detach(ndev, false);
> >  	if (err) {
> >  		netdev_err(ndev, "mana_detach failed: %d\n", err);
> > @@ -416,7 +410,6 @@ static int mana_set_channels(struct net_device *ndev,
> >  	}
> >
> >  out:
> > -	mana_pre_dealloc_rxbufs(apc);
> >  	return err;
> >  }
> >
> > @@ -465,12 +458,7 @@ static int mana_set_ringparam(struct net_device *ndev,
> >
> >  	/* pre-allocating new buffers to prevent failures in mana_attach() later */
> >  	apc->rx_queue_size = new_rx;
> > -	err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
> >  	apc->rx_queue_size = old_rx;
> > -	if (err) {
> > -		netdev_err(ndev, "Insufficient memory for new allocations\n");
> > -		return err;
> > -	}
> >
> >  	err = mana_detach(ndev, false);
> >  	if (err) {
> > @@ -488,7 +476,6 @@ static int mana_set_ringparam(struct net_device *ndev,
> >  		apc->rx_queue_size = old_rx;
> >  	}
> >  out:
> > -	mana_pre_dealloc_rxbufs(apc);
> >  	return err;
> >  }
> >
> > diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
> > index e1030a7d2daa..99a3847b0f9d 100644
> > --- a/include/net/mana/mana.h
> > +++ b/include/net/mana/mana.h
> > @@ -65,6 +65,8 @@ enum TRI_STATE {
> >  #define MANA_STATS_RX_COUNT 5
> >  #define MANA_STATS_TX_COUNT 11
> >
> > +#define MANA_RX_FRAG_ALIGNMENT 64
> > +
> >  struct mana_stats_rx {
> >  	u64 packets;
> >  	u64 bytes;
> > @@ -328,6 +330,7 @@ struct mana_rxq {
> >  	u32 datasize;
> >  	u32 alloc_size;
> >  	u32 headroom;
> > +	u32 frag_count;
> >
> >  	mana_handle_t rxobj;
> >
> > @@ -503,14 +506,6 @@ struct mana_port_context {
> >  	/* This points to an array of num_queues of RQ pointers. */
> >  	struct mana_rxq **rxqs;
> >
> > -	/* pre-allocated rx buffer array */
> > -	void **rxbufs_pre;
> > -	dma_addr_t *das_pre;
> > -	int rxbpre_total;
> > -	u32 rxbpre_datasize;
> > -	u32 rxbpre_alloc_size;
> > -	u32 rxbpre_headroom;
> > -
> >  	struct bpf_prog *bpf_prog;
> >
> >  	/* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively.
> >  	 */
> > @@ -574,8 +569,6 @@ int mana_query_link_cfg(struct mana_port_context *apc);
> >  int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
> >  		      int enable_clamping);
> >  void mana_query_phy_stats(struct mana_port_context *apc);
> > -int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues);
> > -void mana_pre_dealloc_rxbufs(struct mana_port_context *apc);
> >
> >  extern const struct ethtool_ops mana_ethtool_ops;
> >  extern struct dentry *mana_debugfs_root;
> > --
> > 2.43.0
> >
> 
> Rest looks good. After fixing the above,
> Reviewed-by: Saurabh Sengar <ssen...@linux.microsoft.com>
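One more note for anyone following the commit message numbers: the
"35 Rx buffers in a single 64KB page" figure for MTU 1500 can be
reproduced from the new sizing logic in mana_get_rxbuf_cfg(). Below is
a small userspace mock of that calculation; the kernel-side values are
assumptions for a typical 64-bit build (64-byte SKB_DATA_ALIGN
granularity, struct skb_shared_info of roughly 320 bytes) and are not
exact for every config:

	#include <stdio.h>

	/* Userspace stand-ins for the kernel macros used by the driver.
	 * PAGE_SIZE matches the 64KB-page VMs from the test setup.
	 */
	#define PAGE_SIZE		65536
	#define ETH_HLEN		14
	#define SMP_CACHE_BYTES		64
	#define SKB_SHINFO_SIZE		320	/* assumed sizeof(struct skb_shared_info) */
	#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
	#define SKB_DATA_ALIGN(x)	ALIGN(x, SMP_CACHE_BYTES)
	#define MANA_RXBUF_PAD		(SKB_DATA_ALIGN(SKB_SHINFO_SIZE) + ETH_HLEN)
	#define MANA_RX_FRAG_ALIGNMENT	64

	int main(void)
	{
		unsigned int mtu = 1500, headroom = 0;	/* non-XDP, non-jumbo case */
		unsigned int len = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + headroom);
		unsigned int buf_size = ALIGN(len, MANA_RX_FRAG_ALIGNMENT);

		printf("buf_size=%u frag_count=%u\n", buf_size, PAGE_SIZE / buf_size);
		return 0;
	}

With the assumed values this prints buf_size=1856 frag_count=35,
matching the 35-buffers-per-page claim; previously a single Rx buffer
consumed the whole 64KB page.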