On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
allocation in the RX refill path can cause a 15-20% throughput
regression under high connection counts (>16 TCP streams).

Add an ethtool private flag "full-page-rx" that allows the user to
force one RX buffer per page, bypassing the page_pool fragment path.
This restores line-rate (180+ Gbps) performance on affected platforms.

Usage:
  ethtool --set-priv-flags eth0 full-page-rx on
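
The current state of the flag can be read back with the standard
ethtool query:
  ethtool --show-priv-flags eth0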

There is no behavioral change by default. The flag must be explicitly
enabled by the user or a udev rule.
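
For example, a udev rule along these lines (the match keys and ethtool
path are illustrative and may need adjusting for a given deployment)
could enable the flag whenever a MANA interface is registered:

  ACTION=="add", SUBSYSTEM=="net", DRIVERS=="mana", \
          RUN+="/usr/sbin/ethtool --set-priv-flags $name full-page-rx on"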

The existing single-buffer-per-page logic for XDP and jumbo frames is
consolidated into a new helper mana_use_single_rxbuf_per_page().

Signed-off-by: Dipayaan Roy <[email protected]>
---
Changes in v4:
  - Drop the SMBIOS string parsing and add an ethtool priv flag
    to reconfigure the queues with full-page RX buffers.
Changes in v3:
  - Change u8 * to char *.
Changes in v2:
  - Separate reading the string index from reading the string; remove inline.
---
 drivers/net/ethernet/microsoft/mana/mana_en.c |  22 ++-
 .../ethernet/microsoft/mana/mana_ethtool.c    | 159 +++++++++++++++---
 include/net/mana/mana.h                       |   8 +
 3 files changed, 159 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 49c65cc1697c..59a1626c2be1 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -744,6 +744,25 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
        return va;
 }
 
+static bool
+mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
+{
+       /* On some platforms with 4K PAGE_SIZE, page_pool fragment allocation
+        * in the RX refill path (~2kB buffer) can cause significant throughput
+        * regression under high connection counts. Allow user to force one RX
+        * buffer per page via ethtool private flag to bypass the fragment
+        * path.
+        */
+       if (apc->priv_flags & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF))
+               return true;
+
+       /* For xdp and jumbo frames make sure only one packet fits per page. */
+       if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
+               return true;
+
+       return false;
+}
+
 /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
 static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
                               int mtu, u32 *datasize, u32 *alloc_size,
@@ -754,8 +773,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
        /* Calculate datasize first (consistent across all cases) */
        *datasize = mtu + ETH_HLEN;
 
-       /* For xdp and jumbo frames make sure only one packet fits per page */
-       if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
+       if (mana_use_single_rxbuf_per_page(apc, mtu)) {
                if (mana_xdp_get(apc)) {
                        *headroom = XDP_PACKET_HEADROOM;
                        *alloc_size = PAGE_SIZE;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 6a4b42fe0944..9f7393b71a34 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -133,58 +133,91 @@ static const struct mana_stats_desc mana_phy_stats[] = {
        { "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, 
tx_pause_tc7_phy) },
 };
 
+static const char mana_priv_flags[MANA_PRIV_FLAG_MAX][ETH_GSTRING_LEN] = {
+       [MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF] = "full-page-rx"
+};
+
 static int mana_get_sset_count(struct net_device *ndev, int stringset)
 {
        struct mana_port_context *apc = netdev_priv(ndev);
        unsigned int num_queues = apc->num_queues;
 
-       if (stringset != ETH_SS_STATS)
+       switch (stringset) {
+       case ETH_SS_STATS:
+               return ARRAY_SIZE(mana_eth_stats) +
+                      ARRAY_SIZE(mana_phy_stats) +
+                      ARRAY_SIZE(mana_hc_stats)  +
+                      num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+       case ETH_SS_PRIV_FLAGS:
+               return MANA_PRIV_FLAG_MAX;
+       default:
                return -EINVAL;
+       }
+}
+
+static void mana_get_strings_priv_flags(u8 **data)
+{
+       int i;
 
-       return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) + ARRAY_SIZE(mana_hc_stats) +
-                       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+       for (i = 0; i < MANA_PRIV_FLAG_MAX; i++)
+               ethtool_puts(data, mana_priv_flags[i]);
 }
 
-static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
 {
-       struct mana_port_context *apc = netdev_priv(ndev);
        unsigned int num_queues = apc->num_queues;
        int i, j;
 
-       if (stringset != ETH_SS_STATS)
-               return;
        for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++)
-               ethtool_puts(&data, mana_eth_stats[i].name);
+               ethtool_puts(data, mana_eth_stats[i].name);
 
        for (i = 0; i < ARRAY_SIZE(mana_hc_stats); i++)
-               ethtool_puts(&data, mana_hc_stats[i].name);
+               ethtool_puts(data, mana_hc_stats[i].name);
 
        for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++)
-               ethtool_puts(&data, mana_phy_stats[i].name);
+               ethtool_puts(data, mana_phy_stats[i].name);
 
        for (i = 0; i < num_queues; i++) {
-               ethtool_sprintf(&data, "rx_%d_packets", i);
-               ethtool_sprintf(&data, "rx_%d_bytes", i);
-               ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
-               ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
-               ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
-               ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
+               ethtool_sprintf(data, "rx_%d_packets", i);
+               ethtool_sprintf(data, "rx_%d_bytes", i);
+               ethtool_sprintf(data, "rx_%d_xdp_drop", i);
+               ethtool_sprintf(data, "rx_%d_xdp_tx", i);
+               ethtool_sprintf(data, "rx_%d_xdp_redirect", i);
+               ethtool_sprintf(data, "rx_%d_pkt_len0_err", i);
                for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
-                       ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i, j + 
2);
+                       ethtool_sprintf(data,
+                                       "rx_%d_coalesced_cqe_%d",
+                                       i,
+                                       j + 2);
        }
 
        for (i = 0; i < num_queues; i++) {
-               ethtool_sprintf(&data, "tx_%d_packets", i);
-               ethtool_sprintf(&data, "tx_%d_bytes", i);
-               ethtool_sprintf(&data, "tx_%d_xdp_xmit", i);
-               ethtool_sprintf(&data, "tx_%d_tso_packets", i);
-               ethtool_sprintf(&data, "tx_%d_tso_bytes", i);
-               ethtool_sprintf(&data, "tx_%d_tso_inner_packets", i);
-               ethtool_sprintf(&data, "tx_%d_tso_inner_bytes", i);
-               ethtool_sprintf(&data, "tx_%d_long_pkt_fmt", i);
-               ethtool_sprintf(&data, "tx_%d_short_pkt_fmt", i);
-               ethtool_sprintf(&data, "tx_%d_csum_partial", i);
-               ethtool_sprintf(&data, "tx_%d_mana_map_err", i);
+               ethtool_sprintf(data, "tx_%d_packets", i);
+               ethtool_sprintf(data, "tx_%d_bytes", i);
+               ethtool_sprintf(data, "tx_%d_xdp_xmit", i);
+               ethtool_sprintf(data, "tx_%d_tso_packets", i);
+               ethtool_sprintf(data, "tx_%d_tso_bytes", i);
+               ethtool_sprintf(data, "tx_%d_tso_inner_packets", i);
+               ethtool_sprintf(data, "tx_%d_tso_inner_bytes", i);
+               ethtool_sprintf(data, "tx_%d_long_pkt_fmt", i);
+               ethtool_sprintf(data, "tx_%d_short_pkt_fmt", i);
+               ethtool_sprintf(data, "tx_%d_csum_partial", i);
+               ethtool_sprintf(data, "tx_%d_mana_map_err", i);
+       }
+}
+
+static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+{
+       struct mana_port_context *apc = netdev_priv(ndev);
+
+       switch (stringset) {
+       case ETH_SS_PRIV_FLAGS:
+               mana_get_strings_priv_flags(&data);
+               break;
+
+       case ETH_SS_STATS:
+               mana_get_strings_stats(apc, &data);
+               break;
        }
 }
 
@@ -573,6 +606,74 @@ static int mana_get_link_ksettings(struct net_device *ndev,
        return 0;
 }
 
+static u32 mana_get_priv_flags(struct net_device *ndev)
+{
+       struct mana_port_context *apc = netdev_priv(ndev);
+
+       return apc->priv_flags;
+}
+
+static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags)
+{
+       struct mana_port_context *apc = netdev_priv(ndev);
+       u32 changed = apc->priv_flags ^ priv_flags;
+       u32 old_priv_flags = apc->priv_flags;
+       bool schedule_port_reset = false;
+       int err = 0;
+
+       if (!changed)
+               return 0;
+
+       /* Reject unknown bits */
+       if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0))
+               return -EINVAL;
+
+       if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) {
+               apc->priv_flags = priv_flags;
+
+               if (!apc->port_is_up) {
+                       /* Port is down, flag updated to apply on next up
+                        * so just return.
+                        */
+                       return 0;
+               }
+
+               /* Pre-allocate buffers to prevent failure in mana_attach
+                * later
+                */
+               err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
+               if (err) {
+                       netdev_err(ndev,
+                                  "Insufficient memory for new allocations\n");
+                       apc->priv_flags = old_priv_flags;
+                       return err;
+               }
+
+               err = mana_detach(ndev, false);
+               if (err) {
+                       netdev_err(ndev, "mana_detach failed: %d\n", err);
+                       apc->priv_flags = old_priv_flags;
+                       goto out;
+               }
+
+               err = mana_attach(ndev);
+               if (err) {
+                       netdev_err(ndev, "mana_attach failed: %d\n", err);
+                       apc->priv_flags = old_priv_flags;
+                       schedule_port_reset = true;
+               }
+       }
+
+out:
+       mana_pre_dealloc_rxbufs(apc);
+
+       if (err && schedule_port_reset)
+               queue_work(apc->ac->per_port_queue_reset_wq,
+                          &apc->queue_reset_work);
+
+       return err;
+}
+
 const struct ethtool_ops mana_ethtool_ops = {
        .supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES,
        .get_ethtool_stats      = mana_get_ethtool_stats,
@@ -591,4 +692,6 @@ const struct ethtool_ops mana_ethtool_ops = {
        .set_ringparam          = mana_set_ringparam,
        .get_link_ksettings     = mana_get_link_ksettings,
        .get_link               = ethtool_op_get_link,
+       .get_priv_flags         = mana_get_priv_flags,
+       .set_priv_flags         = mana_set_priv_flags,
 };
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 3336688fed5e..fd87e3d6c1f4 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -30,6 +30,12 @@ enum TRI_STATE {
        TRI_STATE_TRUE = 1
 };
 
+/* MANA ethtool private flag bit positions */
+enum mana_priv_flag_bits {
+       MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF = 0,
+       MANA_PRIV_FLAG_MAX,
+};
+
 /* Number of entries for hardware indirection table must be in power of 2 */
 #define MANA_INDIRECT_TABLE_MAX_SIZE 512
 #define MANA_INDIRECT_TABLE_DEF_SIZE 64
@@ -531,6 +537,8 @@ struct mana_port_context {
        u32 rxbpre_headroom;
        u32 rxbpre_frag_count;
 
+       u32 priv_flags;
+
        struct bpf_prog *bpf_prog;
 
        /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
-- 
2.43.0

