On Thu, Jun 04, 2026 at 11:40:30AM -0700, Jacob Keller wrote:
> On 6/2/2026 1:24 PM, Dipayaan Roy wrote:
> > On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
> > allocation in the RX refill path can cause 15-20% throughput
> > regression under high connection counts (>16 TCP streams).
> >
> > Add an ethtool private flag "full-page-rx" that allows the user to
> > force one RX buffer per page, bypassing the page_pool fragment path.
> > This restores line-rate (180+ Gbps) performance on affected platforms.
> >
> > Usage:
> > ethtool --set-priv-flags eth0 full-page-rx on
> >
> > There is no behavioral change by default. The flag must be explicitly
> > enabled by the user or udev rule.
> >
> > The existing single-buffer-per-page logic for XDP and jumbo frames is
> > consolidated into a new helper mana_use_single_rxbuf_per_page() which
> > is now the single decision point for both the automatic and
> > user-controlled paths.
> >
> > Signed-off-by: Dipayaan Roy <[email protected]>
> > ---
>
> I had one or two minor nits, but nothing that I think really deserves a
> v11. The only real comment is a future "gotcha" that could happen if you
> ever added a second private flag, which seems unlikely and maybe not
> worth dealing with until it matters.
>
> Reviewed-by: Jacob Keller <[email protected]>
>
Hi Jacob,
Thank you for the review.
I will keep this patch as is, since no plans for any new private flags.
Regards
Dipayaan Roy
> > drivers/net/ethernet/microsoft/mana/mana_en.c | 22 +++-
> > .../ethernet/microsoft/mana/mana_ethtool.c | 103 ++++++++++++++++++
> > include/net/mana/mana.h | 8 ++
> > 3 files changed, 131 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index db14357d3732..447cecfd3f67 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -744,6 +744,25 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq,
> > dma_addr_t *da)
> > return va;
> > }
> >
> > +static bool
> > +mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
> > +{
> > + /* On some platforms with 4K PAGE_SIZE, page_pool fragment allocation
> > + * in the RX refill path (~2kB buffer) can cause significant throughput
> > + * regression under high connection counts. Allow user to force one RX
> > + * buffer per page via ethtool private flag to bypass the fragment
> > + * path.
> > + */
> > + if (apc->priv_flags & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF))
> > + return true;
> > +
> > + /* For xdp and jumbo frames make sure only one packet fits per page. */
> > + if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
> > + return true;
>
> Technically you could combine all three into one if, but I agree that
> clarity and space for the comment about why the private flag exists
> makes sense.
>
> > +
> > + return false;
> > +}
> > +
> > /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
> > static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
> > int mtu, u32 *datasize, u32 *alloc_size,
> > @@ -754,8 +773,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context
> > *apc,
> > /* Calculate datasize first (consistent across all cases) */
> > *datasize = mtu + ETH_HLEN;
> >
> > - /* For xdp and jumbo frames make sure only one packet fits per page */
> > - if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
> > + if (mana_use_single_rxbuf_per_page(apc, mtu)) {
> > if (mana_xdp_get(apc)) {
> > *headroom = XDP_PACKET_HEADROOM;
> > *alloc_size = PAGE_SIZE;
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > index 7e79681634db..f22bbb325948 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > @@ -133,6 +133,10 @@ static const struct mana_stats_desc mana_phy_stats[] =
> > {
> > { "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats,
> > tx_pause_tc7_phy) },
> > };
> >
> > +static const char mana_priv_flags[MANA_PRIV_FLAG_MAX][ETH_GSTRING_LEN] = {
> > + [MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF] = "full-page-rx"
> > +};
> > +
> > static int mana_get_sset_count(struct net_device *ndev, int stringset)
> > {
> > struct mana_port_context *apc = netdev_priv(ndev);
> > @@ -144,6 +148,10 @@ static int mana_get_sset_count(struct net_device
> > *ndev, int stringset)
> > ARRAY_SIZE(mana_phy_stats) +
> > ARRAY_SIZE(mana_hc_stats) +
> > num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
> > +
> > + case ETH_SS_PRIV_FLAGS:
> > + return MANA_PRIV_FLAG_MAX;
> > +
> > default:
> > return -EINVAL;
> > }
> > @@ -192,6 +200,14 @@ static void mana_get_strings_stats(struct
> > mana_port_context *apc, u8 **data)
> > }
> > }
> >
> > +static void mana_get_strings_priv_flags(u8 **data)
> > +{
> > + int i;
> > +
> > + for (i = 0; i < MANA_PRIV_FLAG_MAX; i++)
> > + ethtool_puts(data, mana_priv_flags[i]);
> > +}
> > +
> > static void mana_get_strings(struct net_device *ndev, u32 stringset, u8
> > *data)
> > {
> > struct mana_port_context *apc = netdev_priv(ndev);
> > @@ -200,6 +216,9 @@ static void mana_get_strings(struct net_device *ndev,
> > u32 stringset, u8 *data)
> > case ETH_SS_STATS:
> > mana_get_strings_stats(apc, &data);
> > break;
> > + case ETH_SS_PRIV_FLAGS:
> > + mana_get_strings_priv_flags(&data);
> > + break;
> > default:
> > break;
> > }
> > @@ -590,6 +609,88 @@ static int mana_get_link_ksettings(struct net_device
> > *ndev,
> > return 0;
> > }
> >
> > +static u32 mana_get_priv_flags(struct net_device *ndev)
> > +{
> > + struct mana_port_context *apc = netdev_priv(ndev);
> > +
> > + return apc->priv_flags;
> > +}
> > +
> > +static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags)
> > +{
> > + struct mana_port_context *apc = netdev_priv(ndev);
> > + u32 changed = apc->priv_flags ^ priv_flags;
> > + u32 old_priv_flags = apc->priv_flags;
> > + bool schedule_port_reset = false;
> > + int err = 0;
> > +
> > + if (!changed)
> > + return 0;
> > +
> > + /* Reject unknown bits */
> > + if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0))
> > + return -EINVAL;
>
> Good. Explicit rejection ensures that there's no risk of bad value. I
> think this is only required for the legacy ioctl interface, and won't be
> able to have a bit set that isn't in your accepted list. However the
> legacy ioctl interface looks like it doesn't do that double checking, so
> its good to have this.
>
> > +
> > + if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) {
> > + apc->priv_flags = priv_flags;
> > +
>
> In the (unlikely) event that you need another private flag in the
> future, this bit seems like it shouldn't be inside the if block here. It
> seems like you'd want to either do this at the end or up front. Of
> course it doesn't matter as long as this is the only private flag you have.
>
> > + if (!apc->port_is_up) {
> > + /* Port is down, flag updated to apply on next up
> > + * so just return.
> > + */
> > + return 0;
> > + }
> > +
> > + /* Pre-allocate buffers to prevent failure in mana_attach
> > + * later
> > + */
> > + err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
> > + if (err) {
> > + netdev_err(ndev,
> > + "Insufficient memory for new allocations\n");
> > + apc->priv_flags = old_priv_flags;
> > + return err;
> > + }
> > +
> > + err = mana_detach(ndev, false);
> > + if (err) {
> > + netdev_err(ndev, "mana_detach failed: %d\n", err);
> > + apc->priv_flags = old_priv_flags;
> > +
> > + /* Port is in an inconsistent state. Restore
> > + * 'port_is_up' so that queue reset work handler
> > + * can properly detach and re-attach.
> > + */
> > + apc->port_is_up = true;
> > + schedule_port_reset = true;
> > + goto out;
> > + }
> > +
> > + err = mana_attach(ndev);
> > + if (err) {
> > + netdev_err(ndev, "mana_attach failed: %d\n", err);
> > + apc->priv_flags = old_priv_flags;
> > +
> > + /* Restore 'port_is_up' so the reset work handler
> > + * can properly detach/attach. Without this,
> > + * the handler sees port_is_up=false and skips
> > + * queue allocation, leaving the port dead.
> > + */
> > + apc->port_is_up = true;
> > + schedule_port_reset = true;
> > + }
>
> I might have made this bit a separate function, but that comes from
> history of working with older drivers which accumulated a larger number
> of private flags. Given that we frown on adding new ones except in more
> rare cases these days, this is probably fine.
>
> > + }
> > +
> > +out:
> > + mana_pre_dealloc_rxbufs(apc);
> > +
> > + if (schedule_port_reset)
> > + queue_work(apc->ac->per_port_queue_reset_wq,
> > + &apc->queue_reset_work);
> > +
> > + return err;
> > +}
> > +
> > const struct ethtool_ops mana_ethtool_ops = {
> > .supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES,
> > .get_ethtool_stats = mana_get_ethtool_stats,
> > @@ -608,4 +709,6 @@ const struct ethtool_ops mana_ethtool_ops = {
> > .set_ringparam = mana_set_ringparam,
> > .get_link_ksettings = mana_get_link_ksettings,
> > .get_link = ethtool_op_get_link,
> > + .get_priv_flags = mana_get_priv_flags,
> > + .set_priv_flags = mana_set_priv_flags,
> > };
> > diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
> > index d9c27310fd04..26fd5e041a47 100644
> > --- a/include/net/mana/mana.h
> > +++ b/include/net/mana/mana.h
> > @@ -30,6 +30,12 @@ enum TRI_STATE {
> > TRI_STATE_TRUE = 1
> > };
> >
> > +/* MANA ethtool private flag bit positions */
> > +enum mana_priv_flag_bits {
> > + MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF = 0,
> > + MANA_PRIV_FLAG_MAX,
>
> For cases like this, I find it helpful to add a comment indicating this
> must be the last entry. (and in that case, drop the trailing comma).
>
> > +};
> > +
> > /* Number of entries for hardware indirection table must be in power of 2
> > */
> > #define MANA_INDIRECT_TABLE_MAX_SIZE 512
> > #define MANA_INDIRECT_TABLE_DEF_SIZE 64
> > @@ -531,6 +537,8 @@ struct mana_port_context {
> > u32 rxbpre_headroom;
> > u32 rxbpre_frag_count;
> >
> > + u32 priv_flags;
> > +
> > struct bpf_prog *bpf_prog;
> >
> > /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
>