Would something even more generic, such as rmaps_base_dist_network_device,
be better? We could update the code to detect Gemini or Aries, for example.

-Nathan

On Wed, Aug 28, 2013 at 04:36:22PM +0000, Jeff Squyres (jsquyres) wrote:
> Can we rename rmaps_base_dist_hca to something that is less specific to IB?
> 
> E.g., rmaps_base_dist_verbs_device?  (admittedly, that's a little long, 
> but...)
> 
> 
> 
> On Aug 28, 2013, at 12:23 PM, <svn-commit-mai...@open-mpi.org> wrote:
> 
> > Author: jladd (Joshua Ladd)
> > Date: 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013)
> > New Revision: 29079
> > URL: https://svn.open-mpi.org/trac/ompi/changeset/29079
> > 
> > Log:
> > Add support for autodetecting a MLNX HCA in the rmaps min distance feature. 
> > In this way, .ini files distributed with software stacks need not specify a 
> > particular HCA but instead may select the key word auto which will 
> > automatically select the discovered device. To use this feature, simply 
> > pass the keyword auto instead of a specific device name, --mca 
> > rmaps_base_dist_hca auto. If more than one card is installed, the mapper 
> > will inform the user of this and, at this point, the user will then need to 
> > specify which card via the normal route, e.g. --mca rmaps_base_dist_hca 
> > <dev_name>. This should be added to 
> > \ncmr=v1.7.4:reviewer=rhc:subject=Autodetect logic for min dist mapping
> > 
> > Text files modified: 
> >   trunk/opal/mca/hwloc/base/base.h                    |     4 ++--          
> >                           
> >   trunk/opal/mca/hwloc/base/hwloc_base_util.c         |    40 
> > ++++++++++++++++++++++++++++++++++++----
> >   trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt |     8 ++++++++      
> >                           
> >   trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c |    11 +++++++++--   
> >                           
> >   4 files changed, 55 insertions(+), 8 deletions(-)
> > 
> > Modified: trunk/opal/mca/hwloc/base/base.h
> > ==============================================================================
> > --- trunk/opal/mca/hwloc/base/base.h        Wed Aug 28 12:03:23 2013        
> > (r29078)
> > +++ trunk/opal/mca/hwloc/base/base.h        2013-08-28 12:23:33 EDT (Wed, 
> > 28 Aug 2013)      (r29079)
> > @@ -169,8 +169,8 @@
> >                                                        hwloc_obj_t obj,
> >                                                        
> > opal_hwloc_resource_type_t rtype);
> > 
> > -OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, 
> > -                                    const char* device_name, 
> > +OPAL_DECLSPEC int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, 
> > +                                    char* device_name, 
> >                                     opal_list_t *sorted_list);
> > 
> > /**
> > 
> > Modified: trunk/opal/mca/hwloc/base/hwloc_base_util.c
> > ==============================================================================
> > --- trunk/opal/mca/hwloc/base/hwloc_base_util.c     Wed Aug 28 12:03:23 
> > 2013        (r29078)
> > +++ trunk/opal/mca/hwloc/base/hwloc_base_util.c     2013-08-28 12:23:33 EDT 
> > (Wed, 28 Aug 2013)      (r29079)
> > @@ -1729,7 +1729,7 @@
> >     }
> > }
> > 
> > -static void sort_by_dist(hwloc_topology_t topo, const char* device_name, 
> > opal_list_t *sorted_list)
> > +static void sort_by_dist(hwloc_topology_t topo, char* device_name, 
> > opal_list_t *sorted_list)
> > {
> >     hwloc_obj_t device_obj = NULL;
> >     hwloc_obj_t obj = NULL, root = NULL;
> > @@ -1751,6 +1751,9 @@
> >                     obj = obj->parent;
> >                 }
> >                 if (obj == NULL) {
> > +                    opal_output_verbose(5, 
> > opal_hwloc_base_framework.framework_output,
> > +                            "hwloc:base:get_sorted_numa_list: NUMA node 
> > closest to %s wasn't found.",
> > +                            device_name);
> >                     return;
> >                 } else {
> >                     close_node_index = obj->logical_index;
> > @@ -1762,6 +1765,8 @@
> >                     /* we can try to find distances under group object. 
> > This info can be there. */
> >                     depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE);
> >                     if (depth < 0) {
> > +                        opal_output_verbose(5, 
> > opal_hwloc_base_framework.framework_output,
> > +                                "hwloc:base:get_sorted_numa_list: There is 
> > no information about distances on the node.");
> >                         return;
> >                     }
> >                     root = hwloc_get_root_obj(topo);
> > @@ -1779,6 +1784,8 @@
> >                 }
> >                 /* find all distances for our close node with logical index 
> > = close_node_index as close_node_index + nbobjs*j */
> >                 if ((NULL == distances) || (0 == distances->nbobjs)) {
> > +                    opal_output_verbose(5, 
> > opal_hwloc_base_framework.framework_output,
> > +                            "hwloc:base:get_sorted_numa_list: There is no 
> > information about distances on the node.");
> >                     return;
> >                 }
> >                 /* fill list of numa nodes */
> > @@ -1797,13 +1804,28 @@
> >     }
> > }
> > 
> > -void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const char* 
> > device_name, opal_list_t *sorted_list)
> > +static int find_devices(hwloc_topology_t topo, char* device_name) 
> > +{
> > +    hwloc_obj_t device_obj = NULL;
> > +    int count = 0;
> > +    for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); 
> > device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) {
> > +        if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
> > +            count++;
> > +            free(device_name);
> > +            device_name = strdup(device_obj->name);
> > +        }
> > +    }
> > +    return count;
> > +}
> > +
> > +int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* 
> > device_name, opal_list_t *sorted_list)
> > {
> >     hwloc_obj_t obj;
> >     opal_list_item_t *item;
> >     opal_hwloc_summary_t *sum;
> >     opal_hwloc_topo_data_t *data;
> >     orte_rmaps_numa_node_t *numa, *copy_numa;
> > +    int count;
> > 
> >     obj = hwloc_get_root_obj(topo);
> > 
> > @@ -1823,9 +1845,19 @@
> >                         copy_numa->dist_from_closed = 
> > numa->dist_from_closed;
> >                         opal_list_append(sorted_list, &copy_numa->super);
> >                     }
> > -                    return;
> > +                    return 0;
> >                 }else {
> >                     /* don't already know it - go get it */
> > +                    /* firstly we check if we need to autodetect 
> > OpenFabrics  devices or we have the specified one */
> > +                    if (!strcmp(device_name, "auto")) {
> > +                        count = find_devices(topo, device_name);
> > +                       if (count > 1) {
> > +                           return count;
> > +                       }
> > +                    }
> > +                    if (!device_name || (strlen(device_name) == 0)) {
> > +                        return 1;
> > +                    }
> >                     sort_by_dist(topo, device_name, sorted_list);
> >                     /* store this info in summary object for later usage */
> >                     OPAL_LIST_FOREACH(numa, sorted_list, 
> > orte_rmaps_numa_node_t) {
> > @@ -1834,7 +1866,7 @@
> >                         copy_numa->dist_from_closed = 
> > numa->dist_from_closed;
> >                         opal_list_append(&(sum->sorted_by_dist_list), 
> > &copy_numa->super);
> >                     }
> > -                    return;
> > +                    return 0;
> >                 }
> >             }
> >         }
> > 
> > Modified: trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt
> > ==============================================================================
> > --- trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt     Wed Aug 28 
> > 12:03:23 2013        (r29078)
> > +++ trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt     2013-08-28 
> > 12:23:33 EDT (Wed, 28 Aug 2013)      (r29079)
> > @@ -29,3 +29,11 @@
> >   Node: %s
> > 
> > Open MPI therefore cannot mapp the application as specified.
> > +#
> > +[orte-rmaps-mindist:several-hca-devices]
> > +There are several OpenFabrics devices found on at least one node. Please 
> > specify the definite one.
> > +
> > +  Devices: %d
> > +  Node: %s
> > +
> > +Open MPI therefore cannot mapp the application as specified.
> > 
> > Modified: trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c
> > ==============================================================================
> > --- trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c     Wed Aug 28 
> > 12:03:23 2013        (r29078)
> > +++ trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c     2013-08-28 
> > 12:23:33 EDT (Wed, 28 Aug 2013)      (r29079)
> > @@ -71,6 +71,7 @@
> >     mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
> >     bool initial_map=true;
> >     bool bynode = false;
> > +    int ret;
> > 
> >     /* this mapper can only handle initial launch
> >      * when mindist mapping is desired
> > @@ -245,7 +246,13 @@
> >              * so we call opal_hwloc_base_get_nbobjs_by_type */
> >             opal_hwloc_base_get_nbobjs_by_type(node->topology, 
> > HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE);
> >             OBJ_CONSTRUCT(&numa_list, opal_list_t);
> > -            opal_hwloc_get_sorted_numa_list(node->topology, 
> > orte_rmaps_base.device, &numa_list);
> > +            ret = opal_hwloc_get_sorted_numa_list(node->topology, 
> > orte_rmaps_base.device, &numa_list);
> > +            if (ret > 1) {
> > +                orte_show_help("help-orte-rmaps-md.txt", 
> > "orte-rmaps-mindist:several-hca-devices",
> > +                        true, ret, node->name);
> > +                rc = ORTE_ERR_SILENT;
> > +                goto error;
> > +            }
> >             if (opal_list_get_size(&numa_list) > 0) {
> >                 j = 0;
> >                 required = 0;
> > @@ -390,7 +397,7 @@
> >         }
> >         OBJ_DESTRUCT(&node_list);
> >     }
> > -
> > +    free(orte_rmaps_base.device);
> >     return ORTE_SUCCESS;
> > 
> > error:
> > _______________________________________________
> > svn-full mailing list
> > svn-f...@open-mpi.org
> > http://www.open-mpi.org/mailman/listinfo.cgi/svn-full
> 
> 
> -- 
> Jeff Squyres
> jsquy...@cisco.com
> For corporate legal information go to: 
> http://www.cisco.com/web/about/doing_business/legal/cri/
> 
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel

Reply via email to