The 5th question is as follows: (5) Receiving with MPI_ANY_SOURCE, MPI_ANY_TAG.
This problem may already have been reported in the Open MPI Trac as "Ticket #1769". It can occur with an ordinary MPI program.

Framework : crcp
Component : bkmrk
The source file : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : do_recv_msg_detail_check_drain, traffic_message_find

Here's the code that causes the problem:

#define BLOCKNUM 1048576
#define SLPTIM 60
if (rank == 0) {
  MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
  MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
  MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
  MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,200,MPI_COMM_WORLD,&sreq[5]);
  printf(" rank=%d sleep start \n",rank); fflush(stdout);
  sleep(SLPTIM); /** take checkpoint at this point **/
  printf(" rank=%d sleep end \n",rank); fflush(stdout);
  MPI_Wait(&sreq[5],&ssts[5]);
} else { /* rank 1 */
  MPI_Recv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rsts[2]);
  MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&rreq[3]);
  MPI_Wait(&rreq[3],&rsts[3]);
  MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq[4]);
  MPI_Wait(&rreq[4],&rsts[4]);
  printf(" rank=%d sleep start \n",rank); fflush(stdout);
  sleep(SLPTIM); /** take checkpoint at this point **/
  printf(" rank=%d sleep end \n",rank); fflush(stdout);
  MPI_Recv(rbuf,BLOCKNUM,MPI_INT,0,200,MPI_COMM_WORLD,&rsts[5]);
}

* Take a checkpoint while Rank 0 and Rank 1 are both executing the sleep function.
* There are two messages in irecv_list that are considered to satisfy the same condition because of MPI_ANY_SOURCE, MPI_ANY_TAG. They are as follows:
  [IRECV=1e44a00 comm_id=6019e0/0/1 msgid=6 count=1048576 tag=100 rank=0 proc_name=-833290239/0 matched=0 done=1 active=0 drain=0 ] [c=0 r=1]
  [IRECV=1e44b80 comm_id=6019e0/0/1 msgid=5 count=1048576 tag=-1 rank=-1 proc_name=-833290239/0 matched=0 done=1 active=0 drain=0 ] [c=0 r=1]
* However, the do_recv_msg_detail_check_drain function obtains the information for only one of these two messages via traffic_message_find.
* Therefore, the information for the other message cannot be obtained.
-bash-3.2$ cat t_mpi_question-5.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

/* Number of int elements in every message buffer sent or received below. */
#define BLOCKNUM 1048576
/* Argument passed to sleep() on both ranks (seconds). */
#define SLPTIM 60

/*
 * Two-process MPI reproducer.
 *
 * Rank 0 sends three blocking messages with tag 100 and then starts one
 * non-blocking send with tag 200.  Rank 1 consumes the three tag-100
 * messages with, in order, a blocking MPI_Recv, an MPI_Irecv using
 * MPI_ANY_SOURCE/MPI_ANY_TAG, and an MPI_Irecv naming source 0 / tag 100.
 * Both ranks then sleep for SLPTIM seconds; afterwards rank 0 waits on its
 * pending send and rank 1 receives the tag-200 message.
 *
 * Each message carries the pattern base+i (base = 100, 200, 300, 400) and
 * rank 1 aborts with error code 1 if a received buffer does not match.
 *
 * ac/av are forwarded to MPI_Init.  Returns 0 when the run completes.
 */
int main(int ac,char **av) {
  int i;
  int rank,size;
  int *wbuf;   /* send buffer (written by rank 0) */
  int *rbuf;   /* receive buffer (written by rank 1) */
  MPI_Status rsts[4],ssts[4];
  MPI_Request rreq[4],sreq[4];
  int bufsize,count;   /* NOTE(review): never referenced in this program */

  MPI_Init(&ac,&av);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  printf(" rank=%d size=%d \n",rank,size); fflush(stdout);

  /* The test is written for exactly two processes. */
  if (size != 2) {
    MPI_Abort(MPI_COMM_WORLD,-1);
  }

  rbuf = (int *)malloc(BLOCKNUM * sizeof(int));
  wbuf = (int *)malloc(BLOCKNUM * sizeof(int));
  /* Abort the whole job if either buffer could not be allocated. */
  if ((rbuf == NULL)||(wbuf == NULL)) {
    MPI_Abort(MPI_COMM_WORLD,-1);
  }
  printf(" rank=%d pass-1 \n",rank); fflush(stdout);

  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 0) {
    /* Sender: refill wbuf with a distinct pattern before every send. */
    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (100+i); }
    MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (200+i); }
    MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (300+i); }
    MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (400+i); }
    /* The tag-200 send is started here but only completed after the sleep. */
    MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,200,MPI_COMM_WORLD,&sreq[0]);
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    /* Per the accompanying report, the checkpoint is taken during this sleep. */
    sleep(SLPTIM);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Wait(&sreq[0],&ssts[0]);
  } else {
    /* Receiver: clear rbuf before every receive, verify it afterwards. */

    /* 1st tag-100 message: blocking receive from rank 0. */
    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Recv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rsts[0]);
    for (i=0;i<BLOCKNUM;i++) { if(rbuf[i] != (100+i)) { MPI_Abort(MPI_COMM_WORLD,1); } }

    /* 2nd tag-100 message: non-blocking wildcard receive. */
    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&rreq[1]);
    /* Status goes into rsts[0] (not rsts[1]), overwriting the first receive's status. */
    MPI_Wait(&rreq[1],&rsts[0]);
    for (i=0;i<BLOCKNUM;i++) { if(rbuf[i] != (200+i)) { MPI_Abort(MPI_COMM_WORLD,1); } }

    /* 3rd tag-100 message: non-blocking receive naming source 0 and tag 100. */
    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq[2]);
    MPI_Wait(&rreq[2],&rsts[2]);
    for (i=0;i<BLOCKNUM;i++) { if(rbuf[i] != (300+i)) { MPI_Abort(MPI_COMM_WORLD,1); } }

    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    /* Per the accompanying report, the checkpoint is taken during this sleep. */
    sleep(SLPTIM);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    /* Tag-200 message: only received after the sleep. */
    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Recv(rbuf,BLOCKNUM,MPI_INT,0,200,MPI_COMM_WORLD,&rsts[3]);
    for (i=0;i<BLOCKNUM;i++) { if(rbuf[i] != (400+i)) { MPI_Abort(MPI_COMM_WORLD,1); } }
  }

  MPI_Barrier(MPI_COMM_WORLD);
  free(rbuf);
  free(wbuf);
  MPI_Finalize();

  /* Only rank 0 reports completion. */
  if (rank == 0) {
    printf(" rank=%d program end \n",rank); fflush(stdout); fflush(stderr);
  }
  return(0);
}