On 7/9/2022 12:49 AM, lon...@linuxonhyperv.com wrote:
CAUTION: This message has originated from an External Source. Please use proper
judgment and caution when opening attachments, clicking links, or responding to
this email.
From: Long Li <lon...@microsoft.com>
MANA is a PCI device. It uses IB verbs to access hardware through the
kernel RDMA layer. This patch introduces build environment and basic
device probe functions.
Signed-off-by: Long Li <lon...@microsoft.com>
---
Change log:
v2:
Fix typos.
Make the driver build only on x86-64 and Linux.
Remove unused header files.
Change port definition to uint16_t or uint8_t (for IB).
Use getline() in place of fgets() to read and truncate a line.
v3:
Add meson build check for required functions from RDMA direct verb header file
v4:
Remove extra "\n" in logging code.
Use "r" in place of "rb" in fopen() to read text files.
<...>
--- /dev/null
+++ b/doc/guides/nics/mana.rst
@@ -0,0 +1,66 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+ Copyright 2022 Microsoft Corporation
+
+MANA poll mode driver library
+=============================
+
+The MANA poll mode driver library (**librte_net_mana**) implements support
+for Microsoft Azure Network Adapter VF in SR-IOV context.
+
Can you please provide a link to an official product description, as a
reference point for anybody interested in more details about the product?
<..>
+
+Netvsc PMD arguments
+--------------------
'Netvsc'? Do you mean 'MANA'?
+
+The user can specify below argument in devargs.
+
+#. ``mac``:
+
+ Specify the MAC address for this device. If it is set, the driver
+ probes and loads the NIC with a matching mac address. If it is not
+ set, the driver probes on all the NICs on the PCI device. The default
+ value is not set, meaning all the NICs will be probed and loaded.
The code accepts up to 8 MAC values; should this be documented?
Also, why is this devarg needed?
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
new file mode 100644
index 0000000000..cb59eb6882
--- /dev/null
+++ b/drivers/net/mana/mana.c
@@ -0,0 +1,704 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <unistd.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#include <ethdev_driver.h>
+#include <ethdev_pci.h>
+#include <rte_kvargs.h>
+#include <rte_eal_paging.h>
+
+#include <infiniband/verbs.h>
+#include <infiniband/manadv.h>
+
+#include <assert.h>
+
+#include "mana.h"
+
+/* Shared memory between primary/secondary processes, per driver */
+struct mana_shared_data *mana_shared_data;
+const struct rte_memzone *mana_shared_mz;
If these global variables are not used by other compilation units,
please try to make them static as much as possible.
+static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
+
+struct mana_shared_data mana_local_data;
+
Can you add a comment for this global variable?
+/* Spinlock for mana_shared_data */
+static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Allocate a buffer on the stack and fill it with a printf format string. */
+#define MKSTR(name, ...) \
+ int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
+ char name[mkstr_size_##name + 1]; \
+ \
+ memset(name, 0, mkstr_size_##name + 1); \
+ snprintf(name, sizeof(name), "" __VA_ARGS__)
+
+int mana_logtype_driver;
+int mana_logtype_init;
+
+const struct eth_dev_ops mana_dev_ops = {
+};
+
+const struct eth_dev_ops mana_dev_sec_ops = {
+};
It may be better to expand 'sec' to 'secondary' so that it is not confused
with 'security' etc.
+
+uint16_t
+mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
+ struct rte_mbuf **pkts __rte_unused,
+ uint16_t pkts_n __rte_unused)
+{
+ rte_mb();
+ return 0;
+}
+
+uint16_t
+mana_tx_burst_removed(void *dpdk_rxq __rte_unused,
+ struct rte_mbuf **pkts __rte_unused,
+ uint16_t pkts_n __rte_unused)
+{
+ rte_mb();
+ return 0;
+}
+
+static const char *mana_init_args[] = {
+ "mac",
+ NULL,
+};
+
+/* Support of parsing up to 8 mac address from EAL command line */
+#define MAX_NUM_ADDRESS 8
+struct mana_conf {
+ struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
+ unsigned int index;
+};
+
+static int mana_arg_parse_callback(const char *key, const char *val,
+ void *private)
Since this is a new driver, it is better to follow the coding convention:
https://doc.dpdk.org/guides/contributing/coding_style.html
Please put the return type on a separate line:
static int
mana_arg_parse_callback(const char *key, const char *val, void *private)
+{
+ struct mana_conf *conf = (struct mana_conf *)private;
+ int ret;
+
+ DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index);
+
+ if (conf->index >= MAX_NUM_ADDRESS) {
+ DRV_LOG(ERR, "Exceeding max MAC address");
+ return 1;
+ }
+
+ ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
+ if (ret) {
+ DRV_LOG(ERR, "Invalid MAC address %s", val);
+ return ret;
+ }
+
+ conf->index++;
+
+ return 0;
+}
+
<...>
+static int get_port_mac(struct ibv_device *device, unsigned int port,
+ struct rte_ether_addr *addr)
+{
+ FILE *file;
+ int ret = 0;
+ DIR *dir;
+ struct dirent *dent;
+ unsigned int dev_port;
+ char mac[20];
+
+ MKSTR(path, "%s/device/net", device->ibdev_path);
+
+ dir = opendir(path);
+ if (!dir)
+ return -ENOENT;
+
+ while ((dent = readdir(dir))) {
+ char *name = dent->d_name;
+
+ MKSTR(filepath, "%s/%s/dev_port", path, name);
+
+ /* Ignore . and .. */
+ if ((name[0] == '.') &&
+ ((name[1] == '\0') ||
+ ((name[1] == '.') && (name[2] == '\0'))))
+ continue;
+
+ file = fopen(filepath, "r");
+ if (!file)
+ continue;
+
+ ret = fscanf(file, "%u", &dev_port);
+ fclose(file);
+
+ if (ret != 1)
+ continue;
+
+ /* Ethernet ports start at 0, IB port start at 1 */
+ if (dev_port == port - 1) {
+ MKSTR(filepath, "%s/%s/address", path, name);
The 'MKSTR' macro defines two variables derived from its first argument, and
'filepath' is already used above. Yes, there is a new scope, but it is better
not to shadow the existing variables; can you select a new name here?
<...>
+
+static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
This is a static function; if you don't use 'pci_drv', why not drop it
from the argument list?
+ struct rte_pci_device *pci_dev,
+ struct rte_ether_addr *mac_addr)
+{
+ struct ibv_device **ibv_list;
+ int ibv_idx;
+ struct ibv_context *ctx;
+ struct ibv_device_attr_ex dev_attr;
+ int num_devices;
+ int ret = 0;
+ uint8_t port;
+ struct mana_priv *priv = NULL;
+ struct rte_eth_dev *eth_dev = NULL;
+ bool found_port;
+
+ ibv_list = ibv_get_device_list(&num_devices);
+ for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
+ struct ibv_device *ibdev = ibv_list[ibv_idx];
+ struct rte_pci_addr pci_addr;
+
+ DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
+ ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
+
+ if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
+ continue;
+
+ /* Ignore if this IB device is not this PCI device */
+ if (pci_dev->addr.domain != pci_addr.domain ||
+ pci_dev->addr.bus != pci_addr.bus ||
+ pci_dev->addr.devid != pci_addr.devid ||
+ pci_dev->addr.function != pci_addr.function)
+ continue;
+
As far as I understand, the intention of this loop is to find the 'ibdev'
matching this device; the code goes through the whole "ibv device list" for
this. I wonder if there is an easier way of doing this, like a sysfs entry
that helps to get this information?
And how do mlx4/5 do this?
+ ctx = ibv_open_device(ibdev);
+ if (!ctx) {
+ DRV_LOG(ERR, "Failed to open IB device %s",
+ ibdev->name);
+ continue;
+ }
+
+ ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
+ DRV_LOG(INFO, "dev_attr.orig_attr.phys_port_cnt %u",
+ dev_attr.orig_attr.phys_port_cnt);
+ found_port = false;
+
+ for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
+ port++) {
+ struct ibv_parent_domain_init_attr attr = {};
"= { 0 };" for portability.
<...>
+static int mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+ struct rte_pci_device *pci_dev)
+{
+ struct rte_devargs *args = pci_dev->device.devargs;
+ struct mana_conf conf = {};
AFAIK, an empty initializer is not part of the C spec yet; why not initialize with "= {0}"?
+ unsigned int i;
+ int ret;
+
+ if (args && args->args) {
You may prefer 'args->drv_str', which is the newer name for 'args'.
<...>
+static const struct rte_pci_id mana_pci_id_map[] = {
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
+ PCI_DEVICE_ID_MICROSOFT_MANA)
+ },
PCI ID list should be terminated with ".vendor_id = 0", otherwise PCI
bus scan loop may behave unexpectedly.
+};
+
+static struct rte_pci_driver mana_pci_driver = {
+ .driver = {
+ .name = "mana_pci",
Driver names are mostly of the form 'net_<driver_name>'; is there a reason to
diverge from it?
Also, if you use the 'RTE_PMD_REGISTER_PCI' macro, it will be standardised
anyway.
+ },
+ .id_table = mana_pci_id_map,
+ .probe = mana_pci_probe,
+ .remove = mana_pci_remove,
+ .drv_flags = RTE_PCI_DRV_INTR_RMV,
+};
+
+RTE_INIT(rte_mana_pmd_init)
+{
+ rte_pci_register(&mana_pci_driver);
+}
+
Why not use the 'RTE_PMD_REGISTER_PCI()' macro instead?
+RTE_PMD_EXPORT_NAME(net_mana, __COUNTER__);
+RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
+RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
+RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
+RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
new file mode 100644
index 0000000000..e30c030b4e
--- /dev/null
+++ b/drivers/net/mana/mana.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#ifndef __MANA_H__
+#define __MANA_H__
+
+enum {
+ PCI_VENDOR_ID_MICROSOFT = 0x1414,
+};
+
+enum {
+ PCI_DEVICE_ID_MICROSOFT_MANA = 0x00ba,
+};
+
+/* Shared data between primary/secondary processes */
+struct mana_shared_data {
+ rte_spinlock_t lock;
+ int init_done;
+ unsigned int primary_cnt;
+ unsigned int secondary_cnt;
+};
+
+#define MIN_RX_BUF_SIZE 1024
+#define MAX_FRAME_SIZE RTE_ETHER_MAX_LEN
+#define BNIC_MAX_MAC_ADDR 1
+
What does the 'BNIC_' prefix stand for? If it is related to the PMD, what do
you think about using 'MANA_' as the prefix?
The same applies to multiple macros below.
<...>
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev);
+
This function is not defined in this patch, so the declaration can be dropped.
<...>
diff --git a/drivers/net/mana/version.map b/drivers/net/mana/version.map
new file mode 100644
index 0000000000..c2e0723b4c
--- /dev/null
+++ b/drivers/net/mana/version.map
@@ -0,0 +1,3 @@
+DPDK_22 {
It is 'DPDK_23' now.