The NetDIM library, currently leveraged by an array of NICs, delivers
excellent acceleration benefits. Nevertheless, NICs vary significantly
in their dim profile list prerequisites.

Specifically, virtio-net backends may be diverse software or hardware device
implementations, making a one-size-fits-all parameter list impractical.
On Alibaba Cloud, the virtio DPU's performance under the default DIM
profile falls short of expectations, partly due to a mismatch in
parameter configuration.

I also noticed that ice/idpf/ena and other NICs have customized their
profile lists or placed some restrictions on DIM capabilities.

Motivated by this, I tried adding a new sysfs attribute that provides
per-device control to access and modify a device's interrupt parameters
(its DIM profile lists).

Usage
========
1. Query the currently customized list of the device

$ cat dim_profs
The profiles of (RX, EQE):
{.usec =   1, .pkts = 256, .comps =   0,},
{.usec =   8, .pkts = 256, .comps =   0,},
{.usec =  64, .pkts = 256, .comps =   0,},
{.usec = 128, .pkts = 256, .comps =   0,},
{.usec = 256, .pkts = 256, .comps =   0,}
The profiles of (TX, EQE):
{.usec =   1, .pkts = 256, .comps =   0,},
{.usec =   2, .pkts = 256, .comps =   0,},
{.usec =   3, .pkts = 256, .comps =   0,},
{.usec =   4, .pkts = 256, .comps =   0,},
{.usec =   5, .pkts = 256, .comps =   0,}

2. Tune

$ echo "RX EQE 8,8,0 16,16,0 32,32,0 64,64,0 128,128,0" > dim_profs
$ echo "  TX  EQE 0,2,0   1,3,0 2,4,0   3,5,0  4,6,0   " > dim_profs
$ cat dim_profs
The profiles of (RX, EQE):
{.usec =   8, .pkts =   8, .comps =   0,},
{.usec =  16, .pkts =  16, .comps =   0,},
{.usec =  32, .pkts =  32, .comps =   0,},
{.usec =  64, .pkts =  64, .comps =   0,},
{.usec = 128, .pkts = 128, .comps =   0,}
The profiles of (TX, EQE):
{.usec =   0, .pkts =   2, .comps =   0,},
{.usec =   1, .pkts =   3, .comps =   0,},
{.usec =   2, .pkts =   4, .comps =   0,},
{.usec =   3, .pkts =   5, .comps =   0,},
{.usec =   4, .pkts =   6, .comps =   0,}

3. Warn
If the device does not implement .ndo_dim_moder_get, reading dim_profs
returns the following message:
"profile is default and not customized by the device."
Writes to a device that does not implement .ndo_dim_moder_set are rejected
with an error.

Signed-off-by: Heng Qi <hen...@linux.alibaba.com>
---
 Documentation/ABI/testing/sysfs-class-net |  17 +++
 include/linux/dim.h                       |   7 ++
 include/linux/netdevice.h                 |  35 ++++++
 lib/dim/net_dim.c                         |   6 --
 net/core/net-sysfs.c                      | 172 ++++++++++++++++++++++++++++++
 5 files changed, 231 insertions(+), 6 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-class-net 
b/Documentation/ABI/testing/sysfs-class-net
index ebf21be..1e4faa8 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -352,3 +352,20 @@ Description:
                0  threaded mode disabled for this dev
                1  threaded mode enabled for this dev
                == ==================================
+
+What:          /sys/class/net/<iface>/dim_profs
+Date:          Mar 2024
+KernelVersion: 6.8
+Contact:       net...@vger.kernel.org
+Description:
+               String value that controls the per-device DIM profile list.
+               Users can write this value to tune the profile list for a given
+               direction (RX/TX) and period mode (EQE/CQE).
+
+               Possible values:
+               ================================================ ==========================
+               RX EQE 1,1,0  2,2,0   3,3,0   4,4,0    5,5,0     tune RX + EQE profile list
+               RX CQE 8,8,0  16,16,0 32,32,0 64,64,0  128,128,0 tune RX + CQE profile list
+               TX EQE 16,8,0 2,16,0  16,8,0  32,64,0  128,64,0  tune TX + EQE profile list
+               TX CQE 8,5,0  8,16,0  32,12,0 128,64,0 256,128,0 tune TX + CQE profile list
+               ================================================ ==========================
diff --git a/include/linux/dim.h b/include/linux/dim.h
index f343bc9..43398f5 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -10,6 +10,13 @@
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
+/* Number of DIM profiles and period mode. */
+#define NET_DIM_PARAMS_NUM_PROFILES 5
+#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256
+#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128
+#define NET_DIM_DEF_PROFILE_CQE 1
+#define NET_DIM_DEF_PROFILE_EQE 1
+
 /*
  * Number of events between DIM iterations.
  * Causes a moderation of the algorithm run.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6f6ac7..bc2f3ac 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -49,6 +49,7 @@
 #include <uapi/linux/netdev.h>
 #include <linux/hashtable.h>
 #include <linux/rbtree.h>
+#include <linux/dim.h>
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
@@ -998,6 +999,27 @@ struct netdev_net_notifier {
        struct notifier_block *nb;
 };
 
+enum dim_direction {	/* direction selector stored in dim_profs_list.direction */
+       DIM_RX_DIRECTION = 0x0,	/* selected by "RX" in the dim_profs sysfs input */
+       DIM_TX_DIRECTION = 0x1,	/* selected by "TX" in the dim_profs sysfs input */
+       DIM_NUM_DIRECTIONS	/* count of directions; loop bound, not a direction */
+};
+/**
+ * struct dim_profs_list - Structure for dim sysfs configuration.
+ * Used to exchange a profile list between sysfs and the driver through
+ * the ndo_dim_moder_get / ndo_dim_moder_set hooks.
+ *
+ * @direction: RX or TX dim information (a value of enum dim_direction)
+ * @mode: CQ period count mode (DIM_CQ_PERIOD_MODE_START_FROM_EQE or
+ *        DIM_CQ_PERIOD_MODE_START_FROM_CQE)
+ * @num: number of entries in @profs
+ * @profs: dim profile list (flexible array holding @num entries)
+ */
+struct dim_profs_list {
+       u8 direction;
+       u8 mode;
+       u8 num;
+       struct dim_cq_moder profs[];
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -1351,6 +1373,14 @@ struct netdev_net_notifier {
  *                        struct kernel_hwtstamp_config *kernel_config,
  *                        struct netlink_ext_ack *extack);
  *     Change the hardware timestamping parameters for NIC device.
+ *
+ * int (*ndo_dim_moder_get)(struct net_device *dev,
+ *                         struct dim_profs_list *list);
+ *     Get dim profiles list from the NIC device.
+ *
+ * int (*ndo_dim_moder_set)(struct net_device *dev,
+ *                         struct dim_profs_list *list);
+ *     Configure dim profiles list for the NIC device.
  */
 struct net_device_ops {
        int                     (*ndo_init)(struct net_device *dev);
@@ -1595,6 +1625,11 @@ struct net_device_ops {
        int                     (*ndo_hwtstamp_set)(struct net_device *dev,
                                                    struct 
kernel_hwtstamp_config *kernel_config,
                                                    struct netlink_ext_ack 
*extack);
+       int                     (*ndo_dim_moder_get)(struct net_device *dev,
+                                                    struct dim_profs_list 
*list);
+
+       int                     (*ndo_dim_moder_set)(struct net_device *dev,
+                                                    struct dim_profs_list 
*list);
 };
 
 /**
diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c
index 4e32f7a..67d5beb 100644
--- a/lib/dim/net_dim.c
+++ b/lib/dim/net_dim.c
@@ -11,12 +11,6 @@
  *        There are different set of profiles for RX/TX CQs.
  *        Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES
  */
-#define NET_DIM_PARAMS_NUM_PROFILES 5
-#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256
-#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128
-#define NET_DIM_DEF_PROFILE_CQE 1
-#define NET_DIM_DEF_PROFILE_EQE 1
-
 #define NET_DIM_RX_EQE_PROFILES { \
        {.usec = 1,   .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \
        {.usec = 8,   .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e3d7a8c..801cb07 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -23,6 +23,7 @@
 #include <linux/of.h>
 #include <linux/of_net.h>
 #include <linux/cpu.h>
+#include <linux/dim.h>
 #include <net/netdev_rx_queue.h>
 #include <net/rps.h>
 
@@ -638,6 +639,176 @@ static ssize_t threaded_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(threaded);
 
+/*
+ * parse_dim_profs - turn one dim_profs sysfs write into a profile list.
+ * @buf: input string, "<RX|TX> <EQE|CQE>" followed by exactly
+ *       NET_DIM_PARAMS_NUM_PROFILES "usec,pkts,comps" tuples; assumed to be
+ *       NUL-terminated, which the sscanf() calls below already rely on
+ * @len: number of bytes the caller was handed in @buf
+ *
+ * Direction and period mode are matched case-insensitively and tokens may
+ * be separated by arbitrary whitespace.  Only whitespace may follow the last
+ * tuple, so extra tuples or stray characters are rejected instead of being
+ * silently dropped.
+ *
+ * Return: a kzalloc()ed list that the caller must kfree(), or NULL if the
+ * allocation failed or the input is malformed.
+ */
+static struct dim_profs_list *parse_dim_profs(const char *buf, ssize_t len)
+{
+       int i, ret, size, totlen = 0, retlen = 0;
+       char direction[3], period_mode[4];
+       struct dim_profs_list *list;
+
+       size = sizeof(*list) +
+              NET_DIM_PARAMS_NUM_PROFILES * sizeof(struct dim_cq_moder);
+       list = kzalloc(size, GFP_KERNEL);
+       if (!list)
+               return NULL;
+
+       list->num = NET_DIM_PARAMS_NUM_PROFILES;
+
+       /* Field widths keep the tokens inside direction[] / period_mode[]. */
+       ret = sscanf(buf, "%2s %3s%n", direction, period_mode, &retlen);
+       if (ret != 2)
+               goto err_parse;
+
+       if (!strcasecmp(direction, "RX"))
+               list->direction = DIM_RX_DIRECTION;
+       else if (!strcasecmp(direction, "TX"))
+               list->direction = DIM_TX_DIRECTION;
+       else
+               goto err_parse;
+
+       if (!strcasecmp(period_mode, "EQE"))
+               list->mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+       else if (!strcasecmp(period_mode, "CQE"))
+               list->mode = DIM_CQ_PERIOD_MODE_START_FROM_CQE;
+       else
+               goto err_parse;
+
+       totlen += retlen;
+       if (totlen > len)
+               goto err_parse;
+
+       buf += retlen;
+
+       for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) {
+               ret = sscanf(buf, "%hu,%hu,%hu%n", &list->profs[i].usec,
+                            &list->profs[i].pkts, &list->profs[i].comps,
+                            &retlen);
+               if (ret != 3)
+                       goto err_parse;
+
+               totlen += retlen;
+               if (totlen > len)
+                       goto err_parse;
+
+               buf += retlen;
+       }
+
+       /* Only trailing whitespace (e.g. the newline from echo) may remain. */
+       while (*buf == ' ' || *buf == '\t' || *buf == '\n' || *buf == '\r')
+               buf++;
+       if (*buf)
+               goto err_parse;
+
+       return list;
+
+err_parse:
+       kfree(list);
+       return NULL;
+}
+
+/*
+ * dim_profs_store - sysfs write handler for the dim_profs attribute.
+ * @dev: device embedded in the target net_device
+ * @attr: the attribute being written (unused)
+ * @buf: user input, see parse_dim_profs() for the accepted format
+ * @len: number of bytes in @buf
+ *
+ * Parses the input and, under RTNL, hands the resulting list to the
+ * driver's ndo_dim_moder_set hook.
+ *
+ * Return: @len on success; -EPERM without CAP_NET_ADMIN in the device's
+ * user namespace; -EINVAL if the input cannot be parsed or the device is
+ * no longer alive; -EOPNOTSUPP if the driver has no ndo_dim_moder_set;
+ * otherwise whatever non-zero value the driver hook returned.
+ */
+static ssize_t dim_profs_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t len)
+{
+       struct net_device *netdev = to_net_dev(dev);
+       const struct net_device_ops *ops = netdev->netdev_ops;
+       struct net *net = dev_net(netdev);
+       struct dim_profs_list *list;
+       /* Never report a 0-byte write: userspace would retry it forever. */
+       ssize_t ret = -EINVAL;
+
+       if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+               return -EPERM;
+
+       list = parse_dim_profs(buf, len);
+       if (!list)
+               return -EINVAL;
+
+       if (!rtnl_trylock()) {
+               /* The write is restarted from scratch; drop this parse. */
+               kfree(list);
+               return restart_syscall();
+       }
+
+       if (!dev_isalive(netdev))
+               goto out;
+
+       if (!ops->ndo_dim_moder_set) {
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
+
+       ret = ops->ndo_dim_moder_set(netdev, list);
+       if (!ret)
+               ret = len;
+
+out:
+       rtnl_unlock();
+       kfree(list);
+
+       return ret;
+}
+
+/*
+ * dim_profs_show_one - append one (direction, mode) profile list to @buf.
+ * @dev: device embedded in the queried net_device
+ * @attr: the attribute being read (unused)
+ * @buf: sysfs output page
+ * @direct: index into the RX/TX name table (enum dim_direction value)
+ * @mode: index into the EQE/CQE name table (CQ period mode value)
+ * @len_: in/out offset into @buf; advanced only when a list was emitted
+ *
+ * The caller must have checked that the driver provides ndo_dim_moder_get.
+ * When the hook returns non-zero for this combination nothing is printed
+ * and 0 is still returned, so the combination is simply skipped.
+ * NOTE(review): this looks like a deliberate "not provided" skip; confirm
+ * that driver errors are not meant to be propagated to the reader.
+ *
+ * Return: 0, or -ENOMEM if the temporary list cannot be allocated.
+ */
+static ssize_t dim_profs_show_one(struct device *dev,
+                                 struct device_attribute *attr,
+                                 char *buf, u8 direct, u8 mode,
+                                 size_t *len_)
+{
+       static const char fmt_body[] =
+               "{.usec = %3hu, .pkts = %3hu, .comps = %3hu,}%s";
+       static const char fmt_hdr[] = "The profiles of (%2s, %3s):\n";
+       static const char * const direction[] = {"RX", "TX"};
+       static const char * const period_mode[] = {"EQE", "CQE"};
+       struct net_device *netdev = to_net_dev(dev);
+       const struct net_device_ops *ops = netdev->netdev_ops;
+       struct dim_profs_list *list;
+       size_t size, len = *len_;
+       int i;
+
+       size = sizeof(*list) +
+              NET_DIM_PARAMS_NUM_PROFILES * sizeof(struct dim_cq_moder);
+       list = kzalloc(size, GFP_KERNEL);
+       if (!list)
+               return -ENOMEM;
+
+       /* Tell the driver which list is wanted and how many slots it has. */
+       list->num = NET_DIM_PARAMS_NUM_PROFILES;
+       list->direction = direct;
+       list->mode = mode;
+       if (ops->ndo_dim_moder_get(netdev, list))
+               goto out;
+
+       /* sysfs_emit_at() bounds every write to the sysfs page for us. */
+       len += sysfs_emit_at(buf, len, fmt_hdr,
+                            direction[direct], period_mode[mode]);
+       for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) {
+               /* Every entry but the last is followed by a comma. */
+               len += sysfs_emit_at(buf, len, fmt_body,
+                                    list->profs[i].usec, list->profs[i].pkts,
+                                    list->profs[i].comps,
+                                    (i == NET_DIM_PARAMS_NUM_PROFILES - 1) ?
+                                    "\n" : ",\n");
+       }
+       *len_ = len;
+out:
+       kfree(list);
+       return 0;
+}
+
+/*
+ * dim_profs_show - sysfs read handler for the dim_profs attribute.
+ * @dev: device embedded in the queried net_device
+ * @attr: the attribute being read
+ * @buf: sysfs output page
+ *
+ * Under RTNL, prints the profile list of every (direction, period mode)
+ * combination through dim_profs_show_one().  If the driver has no
+ * ndo_dim_moder_get hook, a fixed notice is printed instead.
+ *
+ * Return: number of bytes written to @buf, -EINVAL if the device is no
+ * longer alive, or the error returned by dim_profs_show_one().
+ */
+static ssize_t dim_profs_show(struct device *dev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       static const char out[] =
+               "profile is default and not customized by the device.";
+       struct net_device *netdev = to_net_dev(dev);
+       const struct net_device_ops *ops = netdev->netdev_ops;
+       ssize_t ret = -EINVAL;
+       size_t len = 0;
+       u8 i, j;
+
+       if (!rtnl_trylock())
+               return restart_syscall();
+
+       /* Same liveness rule as the store side: no driver calls once dead. */
+       if (!dev_isalive(netdev))
+               goto out;
+
+       if (!ops->ndo_dim_moder_get) {
+               ret = sysfs_emit(buf, "%s\n", out);
+               goto out;
+       }
+
+       for (i = 0; i < DIM_NUM_DIRECTIONS; i++) {
+               for (j = 0; j < DIM_CQ_PERIOD_NUM_MODES; j++) {
+                       ret = dim_profs_show_one(dev, attr, buf, i, j, &len);
+                       if (ret)
+                               goto out;
+               }
+       }
+
+       ret = len;
+out:
+       rtnl_unlock();
+       return ret;
+}
+
+static DEVICE_ATTR_RW(dim_profs);
+
 static struct attribute *net_class_attrs[] __ro_after_init = {
        &dev_attr_netdev_group.attr,
        &dev_attr_type.attr,
@@ -671,6 +842,7 @@ static ssize_t threaded_store(struct device *dev,
        &dev_attr_carrier_up_count.attr,
        &dev_attr_carrier_down_count.attr,
        &dev_attr_threaded.attr,
+       &dev_attr_dim_profs.attr,
        NULL,
 };
 ATTRIBUTE_GROUPS(net_class);
-- 
1.8.3.1


Reply via email to