Can we rename rmaps_base_dist_hca to something that is less specific to IB?
E.g., rmaps_base_dist_verbs_device? (admittedly, that's a little long, but...) On Aug 28, 2013, at 12:23 PM, <svn-commit-mai...@open-mpi.org> wrote: > Author: jladd (Joshua Ladd) > Date: 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) > New Revision: 29079 > URL: https://svn.open-mpi.org/trac/ompi/changeset/29079 > > Log: > Add support for autodetecting a MLNX HCA in the rmaps min distance feature. > In this way, .ini files distributed with software stacks need not specify a > particular HCA but instead may select the key word auto which will > automatically select the discovered device. To use this feature, simply pass > the keyword auto instead of a specific device name, --mca rmaps_base_dist_hca > auto. If more than one card is installed, the mapper will inform the user of > this and, at this point, the user will then need to specify which card via > the normal route, e.g. --mca rmaps_base_dist_hca <dev_name>. This should be > added to \ncmr=v1.7.4:reviewer=rhc:subject=Autodetect logic for min dist > mapping > > Text files modified: > trunk/opal/mca/hwloc/base/base.h | 4 ++-- > > trunk/opal/mca/hwloc/base/hwloc_base_util.c | 40 > ++++++++++++++++++++++++++++++++++++---- > trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt | 8 ++++++++ > > trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c | 11 +++++++++-- > > 4 files changed, 55 insertions(+), 8 deletions(-) > > Modified: trunk/opal/mca/hwloc/base/base.h > ============================================================================== > --- trunk/opal/mca/hwloc/base/base.h Wed Aug 28 12:03:23 2013 (r29078) > +++ trunk/opal/mca/hwloc/base/base.h 2013-08-28 12:23:33 EDT (Wed, 28 Aug > 2013) (r29079) > @@ -169,8 +169,8 @@ > hwloc_obj_t obj, > > opal_hwloc_resource_type_t rtype); > > -OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, > - const char* device_name, > +OPAL_DECLSPEC int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, > + char* device_name, > opal_list_t *sorted_list); > > /** 
> > Modified: trunk/opal/mca/hwloc/base/hwloc_base_util.c > ============================================================================== > --- trunk/opal/mca/hwloc/base/hwloc_base_util.c Wed Aug 28 12:03:23 > 2013 (r29078) > +++ trunk/opal/mca/hwloc/base/hwloc_base_util.c 2013-08-28 12:23:33 EDT > (Wed, 28 Aug 2013) (r29079) > @@ -1729,7 +1729,7 @@ > } > } > > -static void sort_by_dist(hwloc_topology_t topo, const char* device_name, > opal_list_t *sorted_list) > +static void sort_by_dist(hwloc_topology_t topo, char* device_name, > opal_list_t *sorted_list) > { > hwloc_obj_t device_obj = NULL; > hwloc_obj_t obj = NULL, root = NULL; > @@ -1751,6 +1751,9 @@ > obj = obj->parent; > } > if (obj == NULL) { > + opal_output_verbose(5, > opal_hwloc_base_framework.framework_output, > + "hwloc:base:get_sorted_numa_list: NUMA node > closest to %s wasn't found.", > + device_name); > return; > } else { > close_node_index = obj->logical_index; > @@ -1762,6 +1765,8 @@ > /* we can try to find distances under group object. This > info can be there. 
*/ > depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE); > if (depth < 0) { > + opal_output_verbose(5, > opal_hwloc_base_framework.framework_output, > + "hwloc:base:get_sorted_numa_list: There is > no information about distances on the node."); > return; > } > root = hwloc_get_root_obj(topo); > @@ -1779,6 +1784,8 @@ > } > /* find all distances for our close node with logical index = > close_node_index as close_node_index + nbobjs*j */ > if ((NULL == distances) || (0 == distances->nbobjs)) { > + opal_output_verbose(5, > opal_hwloc_base_framework.framework_output, > + "hwloc:base:get_sorted_numa_list: There is no > information about distances on the node."); > return; > } > /* fill list of numa nodes */ > @@ -1797,13 +1804,28 @@ > } > } > > -void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const char* > device_name, opal_list_t *sorted_list) > +static int find_devices(hwloc_topology_t topo, char* device_name) > +{ > + hwloc_obj_t device_obj = NULL; > + int count = 0; > + for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); > device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) { > + if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { > + count++; > + free(device_name); > + device_name = strdup(device_obj->name); > + } > + } > + return count; > +} > + > +int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* > device_name, opal_list_t *sorted_list) > { > hwloc_obj_t obj; > opal_list_item_t *item; > opal_hwloc_summary_t *sum; > opal_hwloc_topo_data_t *data; > orte_rmaps_numa_node_t *numa, *copy_numa; > + int count; > > obj = hwloc_get_root_obj(topo); > > @@ -1823,9 +1845,19 @@ > copy_numa->dist_from_closed = numa->dist_from_closed; > opal_list_append(sorted_list, &copy_numa->super); > } > - return; > + return 0; > }else { > /* don't already know it - go get it */ > + /* firstly we check if we need to autodetect OpenFabrics > devices or we have the specified one */ > + if (!strcmp(device_name, "auto")) { > + 
count = find_devices(topo, device_name); > + if (count > 1) { > + return count; > + } > + } > + if (!device_name || (strlen(device_name) == 0)) { > + return 1; > + } > sort_by_dist(topo, device_name, sorted_list); > /* store this info in summary object for later usage */ > OPAL_LIST_FOREACH(numa, sorted_list, > orte_rmaps_numa_node_t) { > @@ -1834,7 +1866,7 @@ > copy_numa->dist_from_closed = numa->dist_from_closed; > opal_list_append(&(sum->sorted_by_dist_list), > &copy_numa->super); > } > - return; > + return 0; > } > } > } > > Modified: trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt > ============================================================================== > --- trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt Wed Aug 28 > 12:03:23 2013 (r29078) > +++ trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt 2013-08-28 > 12:23:33 EDT (Wed, 28 Aug 2013) (r29079) > @@ -29,3 +29,11 @@ > Node: %s > > Open MPI therefore cannot mapp the application as specified. > +# > +[orte-rmaps-mindist:several-hca-devices] > +There are several OpenFabrics devices found on at least one node. Please > specify the definite one. > + > + Devices: %d > + Node: %s > + > +Open MPI therefore cannot mapp the application as specified. 
> > Modified: trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c > ============================================================================== > --- trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c Wed Aug 28 > 12:03:23 2013 (r29078) > +++ trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c 2013-08-28 > 12:23:33 EDT (Wed, 28 Aug 2013) (r29079) > @@ -71,6 +71,7 @@ > mca_base_component_t *c = &mca_rmaps_mindist_component.base_version; > bool initial_map=true; > bool bynode = false; > + int ret; > > /* this mapper can only handle initial launch > * when mindist mapping is desired > @@ -245,7 +246,13 @@ > * so we call opal_hwloc_base_get_nbobjs_by_type */ > opal_hwloc_base_get_nbobjs_by_type(node->topology, > HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE); > OBJ_CONSTRUCT(&numa_list, opal_list_t); > - opal_hwloc_get_sorted_numa_list(node->topology, > orte_rmaps_base.device, &numa_list); > + ret = opal_hwloc_get_sorted_numa_list(node->topology, > orte_rmaps_base.device, &numa_list); > + if (ret > 1) { > + orte_show_help("help-orte-rmaps-md.txt", > "orte-rmaps-mindist:several-hca-devices", > + true, ret, node->name); > + rc = ORTE_ERR_SILENT; > + goto error; > + } > if (opal_list_get_size(&numa_list) > 0) { > j = 0; > required = 0; > @@ -390,7 +397,7 @@ > } > OBJ_DESTRUCT(&node_list); > } > - > + free(orte_rmaps_base.device); > return ORTE_SUCCESS; > > error: > _______________________________________________ > svn-full mailing list > svn-f...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/svn-full -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/