All the generic probing code needs is an IB device. While this device is
currently supplied by a PCI lookup, other methods will be added soon.

This patch divides the original function, which has become huge over time,
as follows:

1. PCI-specific (mlx5_pci_probe()).
2. All ports of a Verbs device (mlx5_dev_spawn()).
3. A given port of a Verbs device (mlx5_dev_spawn_one()).

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarg...@6wind.com>
--
v2 changes:

- Fixed device naming. A port suffix is now appended only if several IB
  ports happen to be detected.
- Added a separate message to distinguish missing kernel drivers from other
  initialization errors, since reporting both the same way was confusing.
---
 drivers/net/mlx5/mlx5.c | 340 ++++++++++++++++++++++++++-----------------
 1 file changed, 209 insertions(+), 131 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1a5391e63..01dcf25b9 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -635,30 +635,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
 }
 
 /**
- * DPDK callback to register a PCI device.
- *
- * This function creates an Ethernet device for each port of a given
- * PCI device.
+ * Spawn an Ethernet device from Verbs information.
  *
- * @param[in] pci_drv
- *   PCI driver structure (mlx5_driver).
- * @param[in] pci_dev
- *   PCI device information.
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
+ * @param[in] attr
+ *   Verbs device attributes.
+ * @param port
+ *   Verbs port to use (indexed from 1).
  *
  * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
+ *   A valid Ethernet device object on success, NULL otherwise and rte_errno
+ *   is set.
  */
-static int
-mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
-              struct rte_pci_device *pci_dev)
+static struct rte_eth_dev *
+mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
+                  struct ibv_device *ibv_dev,
+                  int vf,
+                  const struct ibv_device_attr_ex *attr,
+                  unsigned int port)
 {
-       struct ibv_device **list = NULL;
-       struct ibv_device *ibv_dev;
-       struct ibv_context *ctx = NULL;
-       struct ibv_device_attr_ex attr;
+       struct ibv_context *ctx;
        struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+       struct rte_eth_dev *eth_dev = NULL;
        int err = 0;
-       unsigned int vf = 0;
        unsigned int mps;
        unsigned int cqe_comp;
        unsigned int tunnel_en = 0;
@@ -670,71 +674,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
__rte_unused,
        unsigned int mprq_max_stride_size_n = 0;
        unsigned int mprq_min_stride_num_n = 0;
        unsigned int mprq_max_stride_num_n = 0;
-       int i;
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
        struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
 
        /* Prepare shared data between primary and secondary process. */
        mlx5_prepare_shared_data();
-       assert(pci_drv == &mlx5_driver);
-       list = mlx5_glue->get_device_list(&i);
-       if (list == NULL) {
-               assert(errno);
-               err = errno;
-               if (errno == ENOSYS)
-                       DRV_LOG(ERR,
-                               "cannot list devices, is ib_uverbs loaded?");
-               goto error;
-       }
-       assert(i >= 0);
-       /*
-        * For each listed device, check related sysfs entry against
-        * the provided PCI ID.
-        */
-       while (i != 0) {
-               struct rte_pci_addr pci_addr;
-
-               --i;
-               DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
-               if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
-                       continue;
-               if ((pci_dev->addr.domain != pci_addr.domain) ||
-                   (pci_dev->addr.bus != pci_addr.bus) ||
-                   (pci_dev->addr.devid != pci_addr.devid) ||
-                   (pci_dev->addr.function != pci_addr.function))
-                       continue;
-               DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
-                       list[i]->name);
-               vf = ((pci_dev->id.device_id ==
-                      PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
-                     (pci_dev->id.device_id ==
-                      PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
-                     (pci_dev->id.device_id ==
-                      PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
-                     (pci_dev->id.device_id ==
-                      PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-               ctx = mlx5_glue->open_device(list[i]);
-               rte_errno = errno;
-               err = rte_errno;
-               break;
-       }
-       if (ctx == NULL) {
-               switch (err) {
-               case 0:
-                       DRV_LOG(ERR,
-                               "cannot access device, is mlx5_ib loaded?");
-                       err = ENODEV;
-                       break;
-               case EINVAL:
-                       DRV_LOG(ERR,
-                               "cannot use device, are drivers up to date?");
-                       break;
-               }
-               goto error;
+       errno = 0;
+       ctx = mlx5_glue->open_device(ibv_dev);
+       if (!ctx) {
+               rte_errno = errno ? errno : ENODEV;
+               return NULL;
        }
-       ibv_dev = list[i];
-       DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
        dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
@@ -822,20 +773,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
__rte_unused,
        DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
                " old OFED/rdma-core version or firmware configuration");
 #endif
-       err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
-       if (err) {
-               DEBUG("ibv_query_device_ex() failed");
-               goto error;
-       }
-       DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-       for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
+       {
                char name[RTE_ETH_NAME_MAX_LEN];
-               int len;
-               uint32_t port = i + 1; /* ports are indexed from one */
                struct ibv_port_attr port_attr;
                struct ibv_pd *pd = NULL;
                struct priv *priv = NULL;
-               struct rte_eth_dev *eth_dev = NULL;
                struct ether_addr mac;
                struct mlx5_dev_config config = {
                        .cqe_comp = cqe_comp,
@@ -859,11 +801,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
__rte_unused,
                        },
                };
 
-               len = snprintf(name, sizeof(name), PCI_PRI_FMT,
-                        pci_dev->addr.domain, pci_dev->addr.bus,
-                        pci_dev->addr.devid, pci_dev->addr.function);
-               if (attr.orig_attr.phys_port_cnt > 1)
-                       snprintf(name + len, sizeof(name), " port %u", i);
+               if (attr->orig_attr.phys_port_cnt > 1)
+                       snprintf(name, sizeof(name), "%s port %u",
+                                dpdk_dev->name, port);
+               else
+                       snprintf(name, sizeof(name), "%s", dpdk_dev->name);
                if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
                        eth_dev = rte_eth_dev_attach_secondary(name);
                        if (eth_dev == NULL) {
@@ -872,7 +814,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                                err = rte_errno;
                                goto error;
                        }
-                       eth_dev->device = &pci_dev->device;
+                       eth_dev->device = dpdk_dev;
                        eth_dev->dev_ops = &mlx5_dev_sec_ops;
                        err = mlx5_uar_init_secondary(eth_dev);
                        if (err) {
@@ -900,16 +842,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
__rte_unused,
                                mlx5_select_rx_function(eth_dev);
                        eth_dev->tx_pkt_burst =
                                mlx5_select_tx_function(eth_dev);
-                       rte_eth_dev_probing_finish(eth_dev);
-                       continue;
+                       mlx5_glue->close_device(ctx);
+                       return eth_dev;
                }
                DRV_LOG(DEBUG, "using port %u", port);
-               if (!ctx)
-                       ctx = mlx5_glue->open_device(ibv_dev);
-               if (ctx == NULL) {
-                       err = ENODEV;
-                       goto port_error;
-               }
                /* Check port status. */
                err = mlx5_glue->query_port(ctx, port, &port_attr);
                if (err) {
@@ -947,23 +883,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
__rte_unused,
                priv->ctx = ctx;
                strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
                        sizeof(priv->ibdev_path));
-               priv->device_attr = attr;
+               priv->device_attr = *attr;
                priv->port = port;
                priv->pd = pd;
                priv->mtu = ETHER_MTU;
-               err = mlx5_args(&config, pci_dev->device.devargs);
+               err = mlx5_args(&config, dpdk_dev->devargs);
                if (err) {
                        err = rte_errno;
                        DRV_LOG(ERR, "failed to process device arguments: %s",
                                strerror(rte_errno));
                        goto port_error;
                }
-               config.hw_csum = !!(attr.device_cap_flags_ex &
+               config.hw_csum = !!(attr->device_cap_flags_ex &
                                    IBV_DEVICE_RAW_IP_CSUM);
                DRV_LOG(DEBUG, "checksum offloading is %ssupported",
                        (config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-               config.flow_counter_en = !!attr.max_counter_sets;
+               config.flow_counter_en = !!attr->max_counter_sets;
                mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
                DRV_LOG(DEBUG,
                        "counter type = %d, num of cs = %ld, attributes = %d",
@@ -971,7 +907,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                        cs_desc.attributes);
 #endif
                config.ind_table_max_size =
-                       attr.rss_caps.max_rwq_indirection_table_size;
+                       attr->rss_caps.max_rwq_indirection_table_size;
                /* Remove this check once DPDK supports larger/variable
                 * indirection tables. */
                if (config.ind_table_max_size >
@@ -979,28 +915,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
__rte_unused,
                        config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
                DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
                        config.ind_table_max_size);
-               config.hw_vlan_strip = !!(attr.raw_packet_caps &
+               config.hw_vlan_strip = !!(attr->raw_packet_caps &
                                         IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
                DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
                        (config.hw_vlan_strip ? "" : "not "));
 
-               config.hw_fcs_strip = !!(attr.raw_packet_caps &
+               config.hw_fcs_strip = !!(attr->raw_packet_caps &
                                         IBV_RAW_PACKET_CAP_SCATTER_FCS);
                DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
                        (config.hw_fcs_strip ? "" : "not "));
 
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-               config.hw_padding = !!attr.rx_pad_end_addr_align;
+               config.hw_padding = !!attr->rx_pad_end_addr_align;
 #endif
                DRV_LOG(DEBUG,
                        "hardware Rx end alignment padding is %ssupported",
                        (config.hw_padding ? "" : "not "));
                config.vf = vf;
-               config.tso = (attr.tso_caps.max_tso > 0 &&
-                             (attr.tso_caps.supported_qpts &
+               config.tso = (attr->tso_caps.max_tso > 0 &&
+                             (attr->tso_caps.supported_qpts &
                               (1 << IBV_QPT_RAW_PACKET)));
                if (config.tso)
-                       config.tso_max_payload_sz = attr.tso_caps.max_tso;
+                       config.tso_max_payload_sz = attr->tso_caps.max_tso;
                if (config.mps && !mps) {
                        DRV_LOG(ERR,
                                "multi-packet send not supported on this device"
@@ -1041,8 +977,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                eth_dev->data->dev_private = priv;
                priv->dev_data = eth_dev->data;
                eth_dev->data->mac_addrs = priv->mac;
-               eth_dev->device = &pci_dev->device;
-               rte_eth_copy_pci_info(eth_dev, pci_dev);
+               eth_dev->device = dpdk_dev;
                eth_dev->device->driver = &mlx5_driver.driver;
                err = mlx5_uar_init_primary(eth_dev);
                if (err) {
@@ -1160,13 +1095,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
__rte_unused,
                                 priv, mem_event_cb);
                rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
                rte_eth_dev_probing_finish(eth_dev);
-               /*
-                * Each eth_dev instance is assigned its own Verbs context,
-                * since this one is consumed, let the next iteration open
-                * another.
-                */
-               ctx = NULL;
-               continue;
+               return eth_dev;
 port_error:
                if (priv)
                        rte_free(priv);
@@ -1174,24 +1103,173 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
__rte_unused,
                        claim_zero(mlx5_glue->dealloc_pd(pd));
                if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
                        rte_eth_dev_release_port(eth_dev);
-               break;
        }
-       /*
-        * XXX if something went wrong in the loop above, there is a resource
-        * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
-        * long as the dpdk does not provide a way to deallocate a ethdev and a
-        * way to enumerate the registered ethdevs to free the previous ones.
-        */
 error:
        if (ctx)
                claim_zero(mlx5_glue->close_device(ctx));
-       if (list)
-               mlx5_glue->free_device_list(list);
-       if (err) {
-               rte_errno = err;
+       assert(err > 0);
+       rte_errno = err;
+       return NULL;
+}
+
+/**
+ * Spawn one Ethernet device per physical port reported by a Verbs device.
+ *
+ * @param dpdk_dev
+ *   Backing DPDK device, forwarded to mlx5_dev_spawn_one() for each port.
+ * @param ibv_dev
+ *   Verbs device; opened here only long enough to query its attributes.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
+ *
+ * @return
+ *   A NULL-terminated list of Ethernet device objects on success, NULL
+ *   otherwise and rte_errno is set (ports spawned so far are rolled back).
+ *   Caller is expected to release list memory through free().
+ */
+static struct rte_eth_dev **
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+              struct ibv_device *ibv_dev,
+              int vf)
+{
+       struct rte_eth_dev **eth_list = NULL;
+       struct ibv_context *ctx;
+       struct ibv_device_attr_ex attr;
+       unsigned int i;
+       int ret;
+
+       errno = 0;
+       ctx = mlx5_glue->open_device(ibv_dev);
+       if (!ctx) {
+               rte_errno = errno ? errno : ENODEV;
+               if (rte_errno == ENODEV)
+                       DRV_LOG(ERR,
+                               "cannot access device, is mlx5_ib loaded?");
+               else
+                       DRV_LOG(ERR,
+                               "cannot use device, are drivers up to date?");
+               return NULL;
+       }
+       ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
+       mlx5_glue->close_device(ctx);
+       if (ret) {
+               rte_errno = ret;
+               DRV_LOG(ERR, "unable to query device information: %s",
+                       strerror(rte_errno));
+               return NULL;
+       }
+       DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+       eth_list = malloc(sizeof(*eth_list) *
+                         (attr.orig_attr.phys_port_cnt + 1));
+       if (!eth_list) {
+               rte_errno = errno;
+               return NULL;
+       }
+       for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
+               eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
+                                                &attr, i + 1);
+               if (eth_list[i])
+                       continue;
+               /* Save rte_errno and roll back in case of failure. */
+               ret = rte_errno;
+               while (i--) {
+                       mlx5_dev_close(eth_list[i]);
+                       if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+                               rte_free(eth_list[i]->data->dev_private);
+                       claim_zero(rte_eth_dev_release_port(eth_list[i]));
+               }
+               free(eth_list);
+               rte_errno = ret;
+               return NULL;
+       }
+       eth_list[i] = NULL;
+       return eth_list;
+}
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * Looks up the Verbs device whose PCI address matches pci_dev and spawns
+ * an Ethernet device for each of its ports through mlx5_dev_spawn().
+ *
+ * @param[in] pci_drv
+ *   PCI driver structure (mlx5_driver).
+ * @param[in] pci_dev
+ *   PCI device information.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+              struct rte_pci_device *pci_dev)
+{
+       struct ibv_device **ibv_list;
+       struct rte_eth_dev **eth_list = NULL;
+       int vf;
+       int ret;
+
+       assert(pci_drv == &mlx5_driver);
+       switch (pci_dev->id.device_id) {
+       case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+       case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+       case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+       case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+               vf = 1;
+               break;
+       default:
+               vf = 0;
+       }
+       errno = 0;
+       ibv_list = mlx5_glue->get_device_list(&ret);
+       if (!ibv_list) {
+               rte_errno = errno ? errno : ENOSYS;
+               DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
+               return -rte_errno;
+       }
+       while (ret-- > 0) {
+               struct rte_pci_addr pci_addr;
+
+               DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+               if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
+                       continue;
+               if (pci_dev->addr.domain != pci_addr.domain ||
+                   pci_dev->addr.bus != pci_addr.bus ||
+                   pci_dev->addr.devid != pci_addr.devid ||
+                   pci_dev->addr.function != pci_addr.function)
+                       continue;
+               DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+                       ibv_list[ret]->name);
+               break;
+       }
+       if (ret >= 0)
+               eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+       mlx5_glue->free_device_list(ibv_list);
+       if (ret < 0) { /* List exhausted without a PCI address match. */
+               DRV_LOG(WARNING,
+                       "no Verbs device matches PCI device " PCI_PRI_FMT ","
+                       " are kernel drivers loaded?",
+                       pci_dev->addr.domain, pci_dev->addr.bus,
+                       pci_dev->addr.devid, pci_dev->addr.function);
+               rte_errno = ENOENT;
+               ret = -rte_errno;
+       } else if (!eth_list || !*eth_list) {
+               DRV_LOG(ERR,
+                       "probe of PCI device " PCI_PRI_FMT " aborted after"
+                       " encountering an error: %s",
+                       pci_dev->addr.domain, pci_dev->addr.bus,
+                       pci_dev->addr.devid, pci_dev->addr.function,
+                       strerror(rte_errno));
+               ret = -rte_errno;
+       } else {
+               for (ret = 0; eth_list[ret]; ++ret) {
+                       rte_eth_copy_pci_info(eth_list[ret], pci_dev);
+                       rte_eth_dev_probing_finish(eth_list[ret]);
+               }
+               ret = 0;
+       }
+       free(eth_list);
+       return ret;
+}
 
 static const struct rte_pci_id mlx5_pci_id_map[] = {
-- 
2.11.0

Reply via email to