Strange - it ran fine for me on multiple tests. I'll check whether something unexpected got into the mix and recommit.
On Oct 17, 2011, at 8:51 PM, George Bosilca wrote: > This commit put the mpirun process in an infinite loop for the simple case > mpirun -np 2 --mca orte_default_hostfile machinefile --bynode *my_app* > > george. > > On Oct 17, 2011, at 15:49 , r...@osl.iu.edu wrote: > >> Author: rhc >> Date: 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011) >> New Revision: 25302 >> URL: https://svn.open-mpi.org/trac/ompi/changeset/25302 >> >> Log: >> Fix the mapping algo for computing vpids - it was borked for bynode >> operations when using nperxxx directives >> >> Text files modified: >> trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c | 67 >> ++++++++++++++++++++------------------- >> 1 files changed, 34 insertions(+), 33 deletions(-) >> >> Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c >> ============================================================================== >> --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c (original) >> +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c 2011-10-17 >> 15:49:04 EDT (Mon, 17 Oct 2011) >> @@ -527,7 +527,7 @@ >> int orte_rmaps_base_compute_vpids(orte_job_t *jdata) >> { >> orte_job_map_t *map; >> - orte_vpid_t vpid; >> + orte_vpid_t vpid, cnt; >> int i, j; >> orte_node_t *node; >> orte_proc_t *proc; >> @@ -539,6 +539,7 @@ >> ORTE_MAPPING_BYSOCKET & map->policy || >> ORTE_MAPPING_BYBOARD & map->policy) { >> /* assign the ranks sequentially */ >> + vpid = 0; >> for (i=0; i < map->nodes->size; i++) { >> if (NULL == (node = >> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { >> continue; >> @@ -553,12 +554,10 @@ >> } >> if (ORTE_VPID_INVALID == proc->name.vpid) { >> /* find the next available vpid */ >> - for (vpid=0; vpid < jdata->num_procs; vpid++) { >> - if (NULL == >> opal_pointer_array_get_item(jdata->procs, vpid)) { >> - break; >> - } >> + while (NULL != >> opal_pointer_array_get_item(jdata->procs, vpid)) { >> + vpid++; >> } >> - proc->name.vpid = vpid; >> + proc->name.vpid = vpid++; >> 
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >> >> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >> >> @@ -580,39 +579,41 @@ >> >> if (ORTE_MAPPING_BYNODE & map->policy) { >> /* assign the ranks round-robin across nodes */ >> - for (i=0; i < map->nodes->size; i++) { >> - if (NULL == (node = >> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { >> - continue; >> - } >> - for (j=0; j < node->procs->size; j++) { >> - if (NULL == (proc = >> (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { >> + cnt = 0; >> + vpid = 0; >> + do { >> + for (i=0; i < map->nodes->size; i++) { >> + if (NULL == (node = >> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { >> continue; >> } >> - /* ignore procs from other jobs */ >> - if (proc->name.jobid != jdata->jobid) { >> - continue; >> - } >> - if (ORTE_VPID_INVALID == proc->name.vpid) { >> - /* find the next available vpid */ >> - vpid = i; >> - while (NULL != >> opal_pointer_array_get_item(jdata->procs, vpid)) { >> - vpid += map->num_nodes; >> - if (jdata->num_procs <= vpid) { >> - vpid = vpid - jdata->num_procs; >> + for (j=0; j < node->procs->size; j++) { >> + if (NULL == (proc = >> (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { >> + continue; >> + } >> + /* ignore procs from other jobs */ >> + if (proc->name.jobid != jdata->jobid) { >> + continue; >> + } >> + if (ORTE_VPID_INVALID == proc->name.vpid) { >> + /* find next available vpid */ >> + while (NULL != >> opal_pointer_array_get_item(jdata->procs, vpid)) { >> + vpid++; >> + } >> + proc->name.vpid = vpid++; >> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >> + >> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >> + if (ORTE_SUCCESS != (rc = >> opal_pointer_array_set_item(jdata->procs, >> + >> proc->name.vpid, proc))) { >> + ORTE_ERROR_LOG(rc); >> + return rc; >> } >> + cnt++; >> + break; /* move to next node */ >> } >> - proc->name.vpid = vpid; >> - 
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >> - >> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >> - } >> - if (NULL == opal_pointer_array_get_item(jdata->procs, >> proc->name.vpid)) { >> - if (ORTE_SUCCESS != (rc = >> opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { >> - ORTE_ERROR_LOG(rc); >> - return rc; >> - } >> } >> } >> - } >> + } while (cnt < jdata->num_procs); >> + >> return ORTE_SUCCESS; >> } >> >> _______________________________________________ >> svn mailing list >> s...@open-mpi.org >> http://www.open-mpi.org/mailman/listinfo.cgi/svn > > > _______________________________________________ > devel mailing list > de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel