Would something even more generic, such as rmaps_base_dist_network_device, be better? We could update the code to detect Gemini or Aries, for example.
-Nathan On Wed, Aug 28, 2013 at 04:36:22PM +0000, Jeff Squyres (jsquyres) wrote: > Can we rename rmaps_base_dist_hca to something that is less specific to IB? > > E.g., rmaps_base_dist_verbs_device? (admittedly, that's a little long, > but...) > > > > On Aug 28, 2013, at 12:23 PM, <svn-commit-mai...@open-mpi.org> wrote: > > > Author: jladd (Joshua Ladd) > > Date: 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) > > New Revision: 29079 > > URL: https://svn.open-mpi.org/trac/ompi/changeset/29079 > > > > Log: > > Add support for autodetecting a MLNX HCA in the rmaps min distance feature. > > In this way, .ini files distributed with software stacks need not specify a > > particular HCA but instead may select the key word auto which will > > automatically select the discovered device. To use this feature, simply > > pass the keyword auto instead of a specific device name, --mca > > rmaps_base_dist_hca auto. If more than one card is installed, the mapper > > will inform the user of this and, at this point, the user will then need to > > specify which card via the normal route, e.g. --mca rmaps_base_dist_hca > > <dev_name>. 
This should be added to > > \ncmr=v1.7.4:reviewer=rhc:subject=Autodetect logic for min dist mapping > > > > Text files modified: > > trunk/opal/mca/hwloc/base/base.h | 4 ++-- > > > > trunk/opal/mca/hwloc/base/hwloc_base_util.c | 40 > > ++++++++++++++++++++++++++++++++++++---- > > trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt | 8 ++++++++ > > > > trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c | 11 +++++++++-- > > > > 4 files changed, 55 insertions(+), 8 deletions(-) > > > > Modified: trunk/opal/mca/hwloc/base/base.h > > ============================================================================== > > --- trunk/opal/mca/hwloc/base/base.h Wed Aug 28 12:03:23 2013 > > (r29078) > > +++ trunk/opal/mca/hwloc/base/base.h 2013-08-28 12:23:33 EDT (Wed, > > 28 Aug 2013) (r29079) > > @@ -169,8 +169,8 @@ > > hwloc_obj_t obj, > > > > opal_hwloc_resource_type_t rtype); > > > > -OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, > > - const char* device_name, > > +OPAL_DECLSPEC int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, > > + char* device_name, > > opal_list_t *sorted_list); > > > > /** > > > > Modified: trunk/opal/mca/hwloc/base/hwloc_base_util.c > > ============================================================================== > > --- trunk/opal/mca/hwloc/base/hwloc_base_util.c Wed Aug 28 12:03:23 > > 2013 (r29078) > > +++ trunk/opal/mca/hwloc/base/hwloc_base_util.c 2013-08-28 12:23:33 EDT > > (Wed, 28 Aug 2013) (r29079) > > @@ -1729,7 +1729,7 @@ > > } > > } > > > > -static void sort_by_dist(hwloc_topology_t topo, const char* device_name, > > opal_list_t *sorted_list) > > +static void sort_by_dist(hwloc_topology_t topo, char* device_name, > > opal_list_t *sorted_list) > > { > > hwloc_obj_t device_obj = NULL; > > hwloc_obj_t obj = NULL, root = NULL; > > @@ -1751,6 +1751,9 @@ > > obj = obj->parent; > > } > > if (obj == NULL) { > > + opal_output_verbose(5, > > opal_hwloc_base_framework.framework_output, > > + 
"hwloc:base:get_sorted_numa_list: NUMA node > > closest to %s wasn't found.", > > + device_name); > > return; > > } else { > > close_node_index = obj->logical_index; > > @@ -1762,6 +1765,8 @@ > > /* we can try to find distances under group object. > > This info can be there. */ > > depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE); > > if (depth < 0) { > > + opal_output_verbose(5, > > opal_hwloc_base_framework.framework_output, > > + "hwloc:base:get_sorted_numa_list: There is > > no information about distances on the node."); > > return; > > } > > root = hwloc_get_root_obj(topo); > > @@ -1779,6 +1784,8 @@ > > } > > /* find all distances for our close node with logical index > > = close_node_index as close_node_index + nbobjs*j */ > > if ((NULL == distances) || (0 == distances->nbobjs)) { > > + opal_output_verbose(5, > > opal_hwloc_base_framework.framework_output, > > + "hwloc:base:get_sorted_numa_list: There is no > > information about distances on the node."); > > return; > > } > > /* fill list of numa nodes */ > > @@ -1797,13 +1804,28 @@ > > } > > } > > > > -void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const char* > > device_name, opal_list_t *sorted_list) > > +static int find_devices(hwloc_topology_t topo, char* device_name) > > +{ > > + hwloc_obj_t device_obj = NULL; > > + int count = 0; > > + for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); > > device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) { > > + if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { > > + count++; > > + free(device_name); > > + device_name = strdup(device_obj->name); > > + } > > + } > > + return count; > > +} > > + > > +int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* > > device_name, opal_list_t *sorted_list) > > { > > hwloc_obj_t obj; > > opal_list_item_t *item; > > opal_hwloc_summary_t *sum; > > opal_hwloc_topo_data_t *data; > > orte_rmaps_numa_node_t *numa, *copy_numa; > > + int count; > > > > obj = 
hwloc_get_root_obj(topo); > > > > @@ -1823,9 +1845,19 @@ > > copy_numa->dist_from_closed = > > numa->dist_from_closed; > > opal_list_append(sorted_list, &copy_numa->super); > > } > > - return; > > + return 0; > > }else { > > /* don't already know it - go get it */ > > + /* firstly we check if we need to autodetect > > OpenFabrics devices or we have the specified one */ > > + if (!strcmp(device_name, "auto")) { > > + count = find_devices(topo, device_name); > > + if (count > 1) { > > + return count; > > + } > > + } > > + if (!device_name || (strlen(device_name) == 0)) { > > + return 1; > > + } > > sort_by_dist(topo, device_name, sorted_list); > > /* store this info in summary object for later usage */ > > OPAL_LIST_FOREACH(numa, sorted_list, > > orte_rmaps_numa_node_t) { > > @@ -1834,7 +1866,7 @@ > > copy_numa->dist_from_closed = > > numa->dist_from_closed; > > opal_list_append(&(sum->sorted_by_dist_list), > > &copy_numa->super); > > } > > - return; > > + return 0; > > } > > } > > } > > > > Modified: trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt > > ============================================================================== > > --- trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt Wed Aug 28 > > 12:03:23 2013 (r29078) > > +++ trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt 2013-08-28 > > 12:23:33 EDT (Wed, 28 Aug 2013) (r29079) > > @@ -29,3 +29,11 @@ > > Node: %s > > > > Open MPI therefore cannot mapp the application as specified. > > +# > > +[orte-rmaps-mindist:several-hca-devices] > > +There are several OpenFabrics devices found on at least one node. Please > > specify the definite one. > > + > > + Devices: %d > > + Node: %s > > + > > +Open MPI therefore cannot mapp the application as specified. 
> > > > Modified: trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c > > ============================================================================== > > --- trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c Wed Aug 28 > > 12:03:23 2013 (r29078) > > +++ trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c 2013-08-28 > > 12:23:33 EDT (Wed, 28 Aug 2013) (r29079) > > @@ -71,6 +71,7 @@ > > mca_base_component_t *c = &mca_rmaps_mindist_component.base_version; > > bool initial_map=true; > > bool bynode = false; > > + int ret; > > > > /* this mapper can only handle initial launch > > * when mindist mapping is desired > > @@ -245,7 +246,13 @@ > > * so we call opal_hwloc_base_get_nbobjs_by_type */ > > opal_hwloc_base_get_nbobjs_by_type(node->topology, > > HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE); > > OBJ_CONSTRUCT(&numa_list, opal_list_t); > > - opal_hwloc_get_sorted_numa_list(node->topology, > > orte_rmaps_base.device, &numa_list); > > + ret = opal_hwloc_get_sorted_numa_list(node->topology, > > orte_rmaps_base.device, &numa_list); > > + if (ret > 1) { > > + orte_show_help("help-orte-rmaps-md.txt", > > "orte-rmaps-mindist:several-hca-devices", > > + true, ret, node->name); > > + rc = ORTE_ERR_SILENT; > > + goto error; > > + } > > if (opal_list_get_size(&numa_list) > 0) { > > j = 0; > > required = 0; > > @@ -390,7 +397,7 @@ > > } > > OBJ_DESTRUCT(&node_list); > > } > > - > > + free(orte_rmaps_base.device); > > return ORTE_SUCCESS; > > > > error: > > _______________________________________________ > > svn-full mailing list > > svn-f...@open-mpi.org > > http://www.open-mpi.org/mailman/listinfo.cgi/svn-full > > > -- > Jeff Squyres > jsquy...@cisco.com > For corporate legal information go to: > http://www.cisco.com/web/about/doing_business/legal/cri/ > > _______________________________________________ > devel mailing list > de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel