This commit puts the mpirun process into an infinite loop for the simple case: mpirun -np 2 --mca orte_default_hostfile machinefile --bynode *my_app*
george. On Oct 17, 2011, at 15:49 , r...@osl.iu.edu wrote: > Author: rhc > Date: 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011) > New Revision: 25302 > URL: https://svn.open-mpi.org/trac/ompi/changeset/25302 > > Log: > Fix the mapping algo for computing vpids - it was borked for bynode > operations when using nperxxx directives > > Text files modified: > trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c | 67 > ++++++++++++++++++++------------------- > 1 files changed, 34 insertions(+), 33 deletions(-) > > Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c > ============================================================================== > --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c (original) > +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c 2011-10-17 > 15:49:04 EDT (Mon, 17 Oct 2011) > @@ -527,7 +527,7 @@ > int orte_rmaps_base_compute_vpids(orte_job_t *jdata) > { > orte_job_map_t *map; > - orte_vpid_t vpid; > + orte_vpid_t vpid, cnt; > int i, j; > orte_node_t *node; > orte_proc_t *proc; > @@ -539,6 +539,7 @@ > ORTE_MAPPING_BYSOCKET & map->policy || > ORTE_MAPPING_BYBOARD & map->policy) { > /* assign the ranks sequentially */ > + vpid = 0; > for (i=0; i < map->nodes->size; i++) { > if (NULL == (node = > (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { > continue; > @@ -553,12 +554,10 @@ > } > if (ORTE_VPID_INVALID == proc->name.vpid) { > /* find the next available vpid */ > - for (vpid=0; vpid < jdata->num_procs; vpid++) { > - if (NULL == > opal_pointer_array_get_item(jdata->procs, vpid)) { > - break; > - } > + while (NULL != opal_pointer_array_get_item(jdata->procs, > vpid)) { > + vpid++; > } > - proc->name.vpid = vpid; > + proc->name.vpid = vpid++; > ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); > > ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); > > @@ -580,39 +579,41 @@ > > if (ORTE_MAPPING_BYNODE & map->policy) { > /* assign the ranks round-robin across nodes */ > - for (i=0; i < 
map->nodes->size; i++) { > - if (NULL == (node = > (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { > - continue; > - } > - for (j=0; j < node->procs->size; j++) { > - if (NULL == (proc = > (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { > + cnt = 0; > + vpid = 0; > + do { > + for (i=0; i < map->nodes->size; i++) { > + if (NULL == (node = > (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { > continue; > } > - /* ignore procs from other jobs */ > - if (proc->name.jobid != jdata->jobid) { > - continue; > - } > - if (ORTE_VPID_INVALID == proc->name.vpid) { > - /* find the next available vpid */ > - vpid = i; > - while (NULL != opal_pointer_array_get_item(jdata->procs, > vpid)) { > - vpid += map->num_nodes; > - if (jdata->num_procs <= vpid) { > - vpid = vpid - jdata->num_procs; > + for (j=0; j < node->procs->size; j++) { > + if (NULL == (proc = > (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { > + continue; > + } > + /* ignore procs from other jobs */ > + if (proc->name.jobid != jdata->jobid) { > + continue; > + } > + if (ORTE_VPID_INVALID == proc->name.vpid) { > + /* find next available vpid */ > + while (NULL != > opal_pointer_array_get_item(jdata->procs, vpid)) { > + vpid++; > + } > + proc->name.vpid = vpid++; > + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); > + > ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); > + if (ORTE_SUCCESS != (rc = > opal_pointer_array_set_item(jdata->procs, > + > proc->name.vpid, proc))) { > + ORTE_ERROR_LOG(rc); > + return rc; > } > + cnt++; > + break; /* move to next node */ > } > - proc->name.vpid = vpid; > - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); > - > ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); > - } > - if (NULL == opal_pointer_array_get_item(jdata->procs, > proc->name.vpid)) { > - if (ORTE_SUCCESS != (rc = > opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { > - ORTE_ERROR_LOG(rc); > - return 
rc; > - } > } > } > - } > + } while (cnt < jdata->num_procs); > + > return ORTE_SUCCESS; > } > > _______________________________________________ > svn mailing list > s...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/svn