Strange - it ran fine for me on multiple tests. I'll check whether something 
unexpected got into the mix and recommit.

On Oct 17, 2011, at 8:51 PM, George Bosilca wrote:

> This commit puts the mpirun process into an infinite loop for the simple case:
> mpirun -np 2 --mca orte_default_hostfile machinefile --bynode *my_app*
> 
>  george.
> 
> On Oct 17, 2011, at 15:49 , r...@osl.iu.edu wrote:
> 
>> Author: rhc
>> Date: 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011)
>> New Revision: 25302
>> URL: https://svn.open-mpi.org/trac/ompi/changeset/25302
>> 
>> Log:
>> Fix the mapping algo for computing vpids - it was borked for bynode 
>> operations when using nperxxx directives
>> 
>> Text files modified: 
>>  trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c |    67 
>> ++++++++++++++++++++------------------- 
>>  1 files changed, 34 insertions(+), 33 deletions(-)
>> 
>> Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c
>> ==============================================================================
>> --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c       (original)
>> +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c       2011-10-17 
>> 15:49:04 EDT (Mon, 17 Oct 2011)
>> @@ -527,7 +527,7 @@
>> int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
>> {
>>    orte_job_map_t *map;
>> -    orte_vpid_t vpid;
>> +    orte_vpid_t vpid, cnt;
>>    int i, j;
>>    orte_node_t *node;
>>    orte_proc_t *proc;
>> @@ -539,6 +539,7 @@
>>        ORTE_MAPPING_BYSOCKET & map->policy ||
>>        ORTE_MAPPING_BYBOARD & map->policy) {
>>        /* assign the ranks sequentially */
>> +        vpid = 0;
>>        for (i=0; i < map->nodes->size; i++) {
>>            if (NULL == (node = 
>> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>>                continue;
>> @@ -553,12 +554,10 @@
>>                }
>>                if (ORTE_VPID_INVALID == proc->name.vpid) {
>>                    /* find the next available vpid */
>> -                    for (vpid=0; vpid < jdata->num_procs; vpid++) {
>> -                        if (NULL == 
>> opal_pointer_array_get_item(jdata->procs, vpid)) {
>> -                            break;
>> -                        }
>> +                    while (NULL != 
>> opal_pointer_array_get_item(jdata->procs, vpid)) {
>> +                        vpid++;
>>                    }
>> -                    proc->name.vpid = vpid;
>> +                    proc->name.vpid = vpid++;
>>                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>>                    
>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>> 
>> @@ -580,39 +579,41 @@
>> 
>>    if (ORTE_MAPPING_BYNODE & map->policy) {
>>        /* assign the ranks round-robin across nodes */
>> -        for (i=0; i < map->nodes->size; i++) {
>> -            if (NULL == (node = 
>> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>> -                continue;
>> -            }
>> -            for (j=0; j < node->procs->size; j++) {
>> -                if (NULL == (proc = 
>> (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
>> +        cnt = 0;
>> +        vpid = 0;
>> +        do {
>> +            for (i=0; i < map->nodes->size; i++) {
>> +                if (NULL == (node = 
>> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>>                    continue;
>>                }
>> -                /* ignore procs from other jobs */
>> -                if (proc->name.jobid != jdata->jobid) {
>> -                    continue;
>> -                }
>> -                if (ORTE_VPID_INVALID == proc->name.vpid) {
>> -                    /* find the next available vpid */
>> -                    vpid = i;
>> -                    while (NULL != 
>> opal_pointer_array_get_item(jdata->procs, vpid)) {
>> -                        vpid += map->num_nodes;
>> -                        if (jdata->num_procs <= vpid) {
>> -                            vpid = vpid - jdata->num_procs;
>> +                for (j=0; j < node->procs->size; j++) {
>> +                    if (NULL == (proc = 
>> (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
>> +                        continue;
>> +                    }
>> +                    /* ignore procs from other jobs */
>> +                    if (proc->name.jobid != jdata->jobid) {
>> +                        continue;
>> +                    }
>> +                    if (ORTE_VPID_INVALID == proc->name.vpid) {
>> +                        /* find next available vpid */
>> +                        while (NULL != 
>> opal_pointer_array_get_item(jdata->procs, vpid)) {
>> +                            vpid++;
>> +                        }
>> +                        proc->name.vpid = vpid++;
>> +                        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>> +                        
>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>> +                        if (ORTE_SUCCESS != (rc = 
>> opal_pointer_array_set_item(jdata->procs,
>> +                                                                            
>>   proc->name.vpid, proc))) {
>> +                            ORTE_ERROR_LOG(rc);
>> +                            return rc;
>>                        }
>> +                        cnt++;
>> +                        break;  /* move to next node */
>>                    }
>> -                    proc->name.vpid = vpid;
>> -                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>> -                    
>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>> -                }
>> -                if (NULL == opal_pointer_array_get_item(jdata->procs, 
>> proc->name.vpid)) {
>> -                    if (ORTE_SUCCESS != (rc = 
>> opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
>> -                        ORTE_ERROR_LOG(rc);
>> -                        return rc;
>> -                    }                    
>>                }
>>            }
>> -        }
>> +        } while (cnt < jdata->num_procs);
>> +
>>        return ORTE_SUCCESS;
>>    }
>> 
>> _______________________________________________
>> svn mailing list
>> s...@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/svn
> 
> 
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel


Reply via email to