I just wanted to follow up on this thread. I filed a ticket with all of these issues, since many of them are potential bugs that should be fixed for v1.5 (and v1.4 if possible). The link to the ticket is below if you want to follow the progress:
  https://svn.open-mpi.org/trac/ompi/ticket/2397

Thanks again for the bug report. Sorry that it has taken me so long to get back to it (dissertation writing really takes the wind out of one's sails). I hope to make forward progress on these and other C/R-related bugs and feature requests over the next month or so.

-- Josh

On Apr 2, 2010, at 1:14 AM, Takayuki Seki wrote:


11th question is as follows:

(11) Communication that uses an inter-communicator deadlocks after a checkpoint is taken.

Framework         : crcp
Component         : bkmrk
The source file   : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : drain_message_find_any

Here's the code that causes the problem:

/* Excerpt of the reproducer; the complete program is listed further below. */
/* Seconds each rank sleeps, leaving a window in which to take the checkpoint. */
#define SLPTIME 60

 buf = -1;
 if (rank == 0) {
   /* Rank 0 posts the send before sleeping, so the request is still pending
    * when the checkpoint is taken. */
   buf = 9014;
MPI_Isend(&buf,1,MPI_INT,0,1000,intercomm,&req); /* using inter-communicator */

   printf(" rank=%d sleep start \n",rank); fflush(stdout);
   sleep(SLPTIME);  /** take checkpoint at this point **/
   printf(" rank=%d sleep end   \n",rank); fflush(stdout);

   MPI_Wait(&req,&sts);
 }
 else if (rank==1) {
   /* Rank 1 posts the matching receive only after the sleep, i.e. after the
    * checkpoint has been taken. */
   printf(" rank=%d sleep start \n",rank); fflush(stdout);
   sleep(SLPTIME);  /** take checkpoint at this point **/
   printf(" rank=%d sleep end   \n",rank); fflush(stdout);

   buf = 0;
MPI_Irecv(&buf,1,MPI_INT,0,1000,intercomm,&req); /* using inter-communicator */
   MPI_Wait(&req,&sts);
 }

* If a checkpoint is taken while Process 0 and Process 1 are in the sleep function,
 the MPI program deadlocks.

* Here's my debugging output.
ft_event_post_drain_message:Irecv drain_msg_ref=8a2f80 rank=0 tag=1000 cnt=1 ddt=4 to=8c27c0 [datatype->size=1]
 wait_quiesce_drained:xx=0 9014
drain_message_find_any:Compare[peer=0] vpid=0 1 jobid=-431423487 -431423487 grp_proc_count=1 89cea0 1
 drain_message_find_any:Compare[peer=0] -> Continue

* Because the vpid/jobid matching done by orte_util_compare_name_fields fails,
 the drain_message_find_any function does not call drain_message_find,
 so the received messages held in bkmrk are not found.
Does the orte_util_compare_name_fields function support inter-communicators?


-bash-3.2$ cat t_mpi_question-11.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60

/*
 * Reproducer for question (11): a send/receive pair over an
 * inter-communicator with a sleep window in which a checkpoint is taken.
 *
 * The world ranks are split into an even group and an odd group, each with
 * its own intra-communicator (localcomm); the two are then joined by an
 * inter-communicator.  World rank 0 posts a non-blocking send to rank 0 of
 * the remote group before sleeping; world rank 1 posts the matching receive
 * only after sleeping.  All other ranks just take part in the barriers.
 *
 * Must be run with an even number of processes; otherwise it aborts.
 * Returns 0 on normal completion.
 */
int main(int ac,char **av)
{
 int rank,size,j,i,buf;
 MPI_Request req;
 MPI_Status sts;
 MPI_Comm localcomm,intercomm;
 MPI_Group worldgrp,localgrp;
 int local_grp_size,localrank,localsize,interrank,intersize;
 int *rank_list;
 int local_leader,remote_leader;

 rank=0;
 MPI_Init(&ac,&av);
 MPI_Comm_rank(MPI_COMM_WORLD,&rank);
 MPI_Comm_size(MPI_COMM_WORLD,&size);
 /* The two local groups must have the same size. */
 if (size%2 != 0) { MPI_Abort(MPI_COMM_WORLD,-1); }

 printf("   rank=%d pass-1 \n",rank); fflush(stdout);
 MPI_Barrier(MPI_COMM_WORLD);

 MPI_Comm_group(MPI_COMM_WORLD,&worldgrp);

 local_grp_size = size / 2;
 rank_list = malloc(sizeof *rank_list * local_grp_size);
 if (rank_list == NULL) { MPI_Abort(MPI_COMM_WORLD,-1); }

 /* Even world ranks list 0,2,4,...; odd world ranks list 1,3,5,... */
 j = ((rank % 2) == 0) ? 0 : 1;
 for (i=0;i<local_grp_size;i++) {
   rank_list[i] = j;
   j+=2;
 }

 MPI_Group_incl(worldgrp,local_grp_size,rank_list,&localgrp);
 MPI_Comm_create(MPI_COMM_WORLD,localgrp,&localcomm);

 MPI_Comm_rank(localcomm,&localrank);
 MPI_Comm_size(localcomm,&localsize);

 printf("   rank=%d size=%d pass-3 LOCAL rank=%d size=%d \n"
   ,rank,size,localrank,localsize);
 fflush(stdout);
 MPI_Barrier(localcomm);
 MPI_Barrier(MPI_COMM_WORLD);

 /* Each group's leader is its local rank 0; the remote leader is named by
  * its world rank (1 for the even group, 0 for the odd group). */
 local_leader  = 0;
 remote_leader = ((rank % 2) == 0) ? 1 : 0;
 MPI_Intercomm_create(localcomm,local_leader,MPI_COMM_WORLD,
                           remote_leader,999,&intercomm);

 MPI_Comm_rank(intercomm,&interrank);
 MPI_Comm_size(intercomm,&intersize);

 printf(" rank=%d size=%d pass-4 LOCAL rank=%d size=%d INTER rank= %d size=%d \n"
   ,rank,size,localrank,localsize,interrank,intersize);
 fflush(stdout);

 MPI_Barrier(intercomm);
 MPI_Barrier(localcomm);
 MPI_Barrier(MPI_COMM_WORLD);

 buf = -1;
 if (rank == 0) {
   /* Post the send first so the request is pending during the checkpoint. */
   buf = 9014;
   MPI_Isend(&buf,1,MPI_INT,0,1000,intercomm,&req);

   printf(" rank=%d sleep start \n",rank); fflush(stdout);
   sleep(SLPTIME);  /* take the checkpoint during this window */
   printf(" rank=%d sleep end   \n",rank); fflush(stdout);

   MPI_Wait(&req,&sts);
 }
 else if (rank==1) {
   printf(" rank=%d sleep start \n",rank); fflush(stdout);
   sleep(SLPTIME);  /* take the checkpoint during this window */
   printf(" rank=%d sleep end   \n",rank); fflush(stdout);

   /* The matching receive is posted only after the checkpoint window. */
   buf = 0;
   MPI_Irecv(&buf,1,MPI_INT,0,1000,intercomm,&req);
   MPI_Wait(&req,&sts);
 }
 printf("   rank=%d pass-5 buf=%d \n",rank,buf); fflush(stdout);

 MPI_Barrier(intercomm);
 MPI_Barrier(localcomm);
 MPI_Barrier(MPI_COMM_WORLD);

 /* Release everything acquired above.  The rank list and the world group
  * are released here rather than right after use so that the process state
  * during the checkpoint window stays the same as before. */
 MPI_Comm_free(&intercomm);
 MPI_Comm_free(&localcomm);
 MPI_Group_free(&localgrp);
 MPI_Group_free(&worldgrp);
 free(rank_list);
 MPI_Finalize();
 if (rank ==0) {
   printf("   rank=%d program end \n",rank); fflush(stdout);
 }
 return 0;
}


_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel

Reply via email to