The 11th question is as follows:
(11) Communication that uses an inter-communicator deadlocks after a
checkpoint is taken.
Framework : crcp
Component : bkmrk
The source file : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : drain_message_find_any
Here's the code that causes the problem:
#define SLPTIME 60

    buf = -1;
    if (rank == 0) {
        buf = 9014;
        MPI_Isend(&buf,1,MPI_INT,0,1000,intercomm,&req); /* using inter-communicator */
        printf(" rank=%d sleep start \n",rank); fflush(stdout);
        sleep(SLPTIME); /** take checkpoint at this point **/
        printf(" rank=%d sleep end \n",rank); fflush(stdout);
        MPI_Wait(&req,&sts);
    }
    else if (rank==1) {
        printf(" rank=%d sleep start \n",rank); fflush(stdout);
        sleep(SLPTIME); /** take checkpoint at this point **/
        printf(" rank=%d sleep end \n",rank); fflush(stdout);
        buf = 0;
        MPI_Irecv(&buf,1,MPI_INT,0,1000,intercomm,&req); /* using inter-communicator */
        MPI_Wait(&req,&sts);
    }
* Take a checkpoint while process 0 and process 1 are inside the sleep()
call; the MPI program then deadlocks.
* Here's my debugging output.
ft_event_post_drain_message:Irecv drain_msg_ref=8a2f80 rank=0 tag=1000 cnt=1 ddt=4 to=8c27c0 [datatype->size=1]
wait_quiesce_drained:xx=0 9014
drain_message_find_any:Compare[peer=0] vpid=0 1 jobid=-431423487 -431423487 grp_proc_count=1 89cea0 1
drain_message_find_any:Compare[peer=0] -> Continue
* Because the vpid/jobid comparison performed by orte_util_compare_name_fields
fails, drain_message_find_any never calls drain_message_find, so the message
that was drained into bkmrk is never found.
Does orte_util_compare_name_fields handle the process names of an
inter-communicator correctly?
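For reference, here is a minimal sketch of the peer lookup I would expect
before the name comparison. This is NOT the actual crcp/bkmrk code:
check_peer_name() is a hypothetical helper, and the header paths, struct
fields, and macro names are taken from my reading of the Open MPI tree and
may differ between versions.

/* Sketch only, not the bkmrk implementation: resolve the peer from the
 * communicator's group before comparing its ORTE name with the sender
 * name recorded for a drained message. */
#include <stdbool.h>
#include "ompi/communicator/communicator.h"
#include "ompi/group/group.h"
#include "ompi/proc/proc.h"
#include "opal/dss/dss_types.h"
#include "orte/util/name_fns.h"

static bool check_peer_name(ompi_communicator_t *comm, int rank,
                            orte_process_name_t *msg_name)
{
    /* For an inter-communicator the source rank of a receive refers to the
     * *remote* group; for an intra-communicator the local and remote groups
     * are the same, so the distinction only matters in the inter case. */
    ompi_group_t *grp = OMPI_COMM_IS_INTER(comm) ? comm->c_remote_group
                                                 : comm->c_local_group;
    ompi_proc_t *peer = ompi_group_peer_lookup(grp, rank);

    return (OPAL_EQUAL ==
            orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                          &peer->proc_name,
                                          msg_name));
}

If drain_message_find_any resolves the peer from the local group only when
the request was posted on an inter-communicator, that would explain the
vpid 0 vs. 1 mismatch in the trace above.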
-bash-3.2$ cat t_mpi_question-11.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60

int main(int ac, char **av)
{
    int rank, size, cc, j, i, buf;
    MPI_Request req;
    MPI_Status sts;
    MPI_Comm localcomm, intercomm;
    MPI_Group worldgrp, localgrp;
    int local_grp_size, localrank, localsize, interrank, intersize;
    int *rank_list;
    int local_leader, remote_leader;

    rank = 0;
    MPI_Init(&ac, &av);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size % 2 != 0) { MPI_Abort(MPI_COMM_WORLD, -1); }
    printf(" rank=%d pass-1 \n", rank); fflush(stdout);
    MPI_Barrier(MPI_COMM_WORLD);

    /* Split the world ranks into two groups: even ranks and odd ranks. */
    MPI_Comm_group(MPI_COMM_WORLD, &worldgrp);
    local_grp_size = size / 2;
    rank_list = (int *)malloc(sizeof(int) * local_grp_size);
    if (rank_list == NULL) { MPI_Abort(MPI_COMM_WORLD, -1); }
    j = ((rank % 2) == 0) ? 0 : 1;
    for (i = 0; i < local_grp_size; i++) {
        rank_list[i] = j;
        j += 2;
    }
    MPI_Group_incl(worldgrp, local_grp_size, rank_list, &localgrp);
    MPI_Comm_create(MPI_COMM_WORLD, localgrp, &localcomm);
    MPI_Comm_rank(localcomm, &localrank);
    MPI_Comm_size(localcomm, &localsize);
    printf(" rank=%d size=%d pass-3 LOCAL rank=%d size=%d \n",
           rank, size, localrank, localsize);
    fflush(stdout);
    MPI_Barrier(localcomm);
    MPI_Barrier(MPI_COMM_WORLD);

    /* Create an inter-communicator between the even group and the odd group. */
    local_leader = 0;
    remote_leader = ((rank % 2) == 0) ? 1 : 0;
    MPI_Intercomm_create(localcomm, local_leader, MPI_COMM_WORLD,
                         remote_leader, 999, &intercomm);
    MPI_Comm_rank(intercomm, &interrank);
    MPI_Comm_size(intercomm, &intersize);
    printf(" rank=%d size=%d pass-4 LOCAL rank=%d size=%d INTER rank=%d size=%d \n",
           rank, size, localrank, localsize, interrank, intersize);
    fflush(stdout);
    MPI_Barrier(intercomm);
    MPI_Barrier(localcomm);
    MPI_Barrier(MPI_COMM_WORLD);

    buf = -1;
    if (rank == 0) {
        buf = 9014;
        MPI_Isend(&buf, 1, MPI_INT, 0, 1000, intercomm, &req); /* using inter-communicator */
        printf(" rank=%d sleep start \n", rank); fflush(stdout);
        sleep(SLPTIME); /** take checkpoint at this point **/
        printf(" rank=%d sleep end \n", rank); fflush(stdout);
        MPI_Wait(&req, &sts);
    }
    else if (rank == 1) {
        printf(" rank=%d sleep start \n", rank); fflush(stdout);
        sleep(SLPTIME); /** take checkpoint at this point **/
        printf(" rank=%d sleep end \n", rank); fflush(stdout);
        buf = 0;
        MPI_Irecv(&buf, 1, MPI_INT, 0, 1000, intercomm, &req); /* using inter-communicator */
        MPI_Wait(&req, &sts);
    }
    printf(" rank=%d pass-5 buf=%d \n", rank, buf); fflush(stdout);

    MPI_Barrier(intercomm);
    MPI_Barrier(localcomm);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Comm_free(&intercomm);
    MPI_Comm_free(&localcomm);
    MPI_Group_free(&localgrp);
    MPI_Finalize();
    if (rank == 0) {
        printf(" rank=%d program end \n", rank); fflush(stdout);
    }
    return (0);
}