The 3rd question is as follows: (3) If information about messages with the same conditions exists in two or more lists, the assertion assert(need <= found) fails in the send_msg_details function. I built Open MPI with the "--enable-debug" configure option.
Framework : crcp Component : bkmrk The source file : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c The function name : send_msg_details,do_recv_msg_detail_check_drain Here's the code that causes the problem: #define BLOCKNUM 1 #define SLPTIM 60 if (rank == 0) { MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD); MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD); MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD); MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts); MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts); MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts); MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); printf(" rank=%d sleep start \n",rank); fflush(stdout); sleep(SLPTIM); /** take checkpoint at this point **/ printf(" rank=%d sleep end \n",rank); fflush(stdout); MPI_Wait(&sreq,&ssts); } else { /* rank 1 */ MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); printf(" rank=%d sleep start \n",rank); fflush(stdout); sleep(SLPTIM); /** take checkpoint at this point **/ printf(" rank=%d sleep end \n",rank); fflush(stdout); MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); } * Take checkpoint while Process 0 and Process 1 are in sleep function * Here's the tag,elements,type,and communicator of the message; message tag=100,number of elements=1,data type=MPI_INT,communicator=MPI_COMM_WORLD * Send side(Rank 0): The information of the message of the same condition exists in both 
send_list and isend_list. * Recv side (Rank 1): The information about the message exists in irecv_list only. I wonder whether there is a problem with the message matching in the do_recv_msg_detail_check_drain function. * Result rank=0 size=2 rank=1 size=2 rank=0 sleep start rank=1 sleep start rank=0 sleep end rank=1 sleep end t_mpi_question-3.out: ../../../../../ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c:5471: send_msg_details: Assertion `need <= found' failed. [camel0:24606] *** Process received signal *** [camel0:24606] Signal: Aborted (6) [camel0:24606] Signal code: (-6) -bash-3.2$ cat t_mpi_question-3.c #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <time.h> #include <limits.h> #include "mpi.h" #define BLOCKNUM 1 #define SLPTIM 60 int main(int ac,char **av) { int i; int rank,size; int *wbuf; int *rbuf; MPI_Status rsts,ssts; MPI_Request rreq,sreq; MPI_Init(&ac,&av); MPI_Comm_rank(MPI_COMM_WORLD,&rank); MPI_Comm_size(MPI_COMM_WORLD,&size); if (size != 2) { MPI_Abort(MPI_COMM_WORLD,-1); } rbuf = (int *)malloc(BLOCKNUM * sizeof(int)); wbuf = (int *)malloc(BLOCKNUM * sizeof(int)); if ((rbuf == NULL)||(wbuf == NULL)) { MPI_Abort(MPI_COMM_WORLD,-1); } printf(" rank=%d size=%d \n",rank,size); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) { for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (100 + i); } MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD); for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (200 + i); } MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD); for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (300 + i); } MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD); for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (400 + i); } MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts); for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (500 + i); } MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts); for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (600 + i); } MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); 
MPI_Wait(&sreq,&ssts); for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (700 + i); } MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); printf(" rank=%d sleep start \n",rank); fflush(stdout); sleep(SLPTIM); printf(" rank=%d sleep end \n",rank); fflush(stdout); MPI_Wait(&sreq,&ssts); } else { for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; } MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (100 + i)) { abort(); } } for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; } MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (200 + i)) { abort(); } } for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; } MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (300 + i)) { abort(); } } for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; } MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (400 + i)) { abort(); } } for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; } MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (500 + i)) { abort(); } } for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; } MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (600 + i)) { abort(); } } printf(" rank=%d sleep start \n",rank); fflush(stdout); sleep(SLPTIM); printf(" rank=%d sleep end \n",rank); fflush(stdout); for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; } MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts); for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (700 + i)) { abort(); } } } MPI_Barrier(MPI_COMM_WORLD); free(rbuf); free(wbuf); MPI_Finalize(); if (rank == 0) { printf(" rank=%d program end \n",rank); fflush(stdout); } return(0); }