I'm trying to use the checkpoint/restart feature of Open MPI.
I'm using Open MPI 1.4.1 and BLCR 0.8.2.
But it doesn't work well.
I'm looking into the source code.
I have some questions about checkpoint/restart.
Could anyone answer my questions?
I will ask them one by one.
My 1st question is as follows:
(1) Clearing the send_init_list, recv_init_list.
Framework : crcp
Component : bkmrk
The source file : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : ft_event_finalize_exchange
I found the comment /* Clear send_init_list */ in ft_event_finalize_exchange
function.
However, the corresponding source code seems to clear send_list rather than
send_init_list.
Source code is as follows:
/* Clear send_init_list */
for(rm_item = opal_list_get_last(&peer_ref->send_list);
rm_item != opal_list_get_begin(&peer_ref->send_list);
Is it correct?
send_list seems to have been cleared already by this point.
The same applies to the clearing of recv_init_list.
The comment is /* Clear recv_init_list */.
However, the corresponding source code seems to clear recv_list rather than
recv_init_list.
recv_list seems to have been cleared already by this point.
Source code is as follows:
/* Clear recv_init_list */
for(rm_item = opal_list_get_last(&peer_ref->recv_list);
rm_item != opal_list_get_begin(&peer_ref->recv_list);
Here's the code that causes the problem:
/*
 * Abridged reproducer: each rank reuses a single persistent request (tag 100)
 * and sleeps twice so that a checkpoint can be taken during each sleep.
 */
#define BLOCKNUM 1
#define SLPTIM 60
if (rank == 0) {
MPI_Send_init(&buf[0],BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&req1);
/* Two sends are started and waited on before the first sleep. */
MPI_Start(&req1); MPI_Wait(&req1,&sts1);
MPI_Start(&req1); MPI_Wait(&req1,&sts1);
printf(" rank=%d sleep1 start \n",rank); fflush(stdout);
sleep(SLPTIM); /** take checkpoint(1st time) **/
printf(" rank=%d sleep1 end \n",rank); fflush(stdout);
/* The third send is started here; its MPI_Wait comes only after the second sleep. */
MPI_Start(&req1);
printf(" rank=%d sleep2 start \n",rank); fflush(stdout);
sleep(SLPTIM); /** take checkpoint(2nd time), and deadlock occurs. **/
printf(" rank=%d sleep2 end \n",rank); fflush(stdout);
MPI_Wait(&req1,&sts1);
MPI_Request_free(&req1);
} else { /* rank 1 */
MPI_Recv_init(&buf[0],BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&req1);
/* Only one receive is started and waited on before the first sleep. */
MPI_Start(&req1); MPI_Wait(&req1,&sts1);
printf(" rank=%d sleep1 start \n",rank); fflush(stdout);
sleep(SLPTIM); /** take checkpoint(1st time) **/
printf(" rank=%d sleep1 end \n",rank); fflush(stdout);
/* Second receive: started and waited on between the two sleeps. */
MPI_Start(&req1); MPI_Wait(&req1,&sts1);
printf(" rank=%d sleep2 start \n",rank); fflush(stdout);
sleep(SLPTIM); /** take checkpoint(2nd time), and deadlock occurs. **/
printf(" rank=%d sleep2 end \n",rank); fflush(stdout);
/* Third receive: started and waited on after the second sleep. */
MPI_Start(&req1); MPI_Wait(&req1,&sts1);
MPI_Request_free(&req1);
}
* Take a checkpoint twice.
* Take the checkpoint while process 0 is in the MPI_Send function and process 1 is in
the sleep function.
* A deadlock occurs when the checkpoint is taken the second time.
* Here's my debugging output.
rank=1 pass-1 100
rank=1 sleep1 start /* 1st checkpoint */
rank=0 sleep1 start /* 1st checkpoint */
rank=1 sleep1 end
rank=0 sleep1 end
DEBUG:do_recv_msg_detail_check_drain p_num_sent(from sender)=1 /* MPI_Barrier
*/
DEBUG:do_recv_msg_detail_check_drain p_num_sent(from sender)=2
DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref BEFORE-UPDATE
matched=0 done=1 num_left_unresolved=2
DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref AFTER-UPDATE
matched=1 done=1 num_left_unresolved=1
DEBUG: num_left_unresolved=1 goto cleapup
DEBUG:ft_event_post_drain_message calls wrapped_pml_module->pml_irecv tag=100
count=1 ddt_size=4
rank=1 pass-2 200
rank=1 sleep2 start /* 2nd checkpoint */
rank=0 sleep2 start /* 2nd checkpoint */
rank=1 sleep2 end
rank=0 sleep2 end
DEBUG:do_recv_msg_detail_check_drain p_num_sent(from sender)=3
/* Sender sent the wrong value("3"). I think the correct value may be "1". */
DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref BEFORE-UPDATE
matched=1 done=1 num_left_unresolved=3
DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref AFTER-UPDATE
matched=2 done=1 num_left_unresolved=2
/* The wrong values are set in the receiver's recv_init_list,
because recv_init_list was not cleared when the first checkpoint was
taken. */
DEBUG: num_left_unresolved=2 goto cleapup
DEBUG:ft_event_post_drain_message calls wrapped_pml_module->pml_irecv tag=100
count=1 ddt_size=4
DEBUG:ft_event_post_drain_message calls wrapped_pml_module->pml_irecv tag=100
count=1 ddt_size=4
/* the wrong receiving is issued. */
-bash-3.2$ cat t_mpi_question-1.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"
#define BLOCKNUM 1
#define SLPTIM 60
/*
 * Reproducer for a checkpoint/restart problem with persistent requests.
 *
 * Must be run with exactly two ranks; any other size aborts the job.
 * Rank 0 sends three messages (first element 100, 200, 300) to rank 1 with
 * tag 100 through one persistent send request.  Rank 1 receives them through
 * one persistent receive request and prints the first element of each.
 *
 * Each rank sleeps SLPTIM seconds twice; the checkpoints are meant to be taken
 * externally during those sleeps.  The statement order is deliberate:
 *   - at the first sleep, rank 0 has completed two sends while rank 1 has
 *     completed only one receive;
 *   - at the second sleep, rank 0 has started (but not waited on) its third
 *     send while rank 1 has completed two receives.
 * Do not reorder the MPI calls, or the scenario changes.
 *
 * Returns 0 on normal completion.
 */
int main(int ac,char **av)
{
    int i,rank,size;
    int *buf;
    MPI_Status sts1;
    MPI_Request req1;

    MPI_Init(&ac,&av);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    if (size != 2) { MPI_Abort(MPI_COMM_WORLD,-1); }

    /* Size the buffer from the pointee type; no cast is needed in C. */
    buf = malloc(sizeof *buf * BLOCKNUM);
    if (buf == NULL) { MPI_Abort(MPI_COMM_WORLD,-1); }

    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 0) {
        MPI_Send_init(&buf[0],BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&req1);

        /* Messages 1 and 2: both complete before the first sleep. */
        for (i=0;i<BLOCKNUM;i++) { buf[i] = (100+i); }
        MPI_Start(&req1); MPI_Wait(&req1,&sts1);
        for (i=0;i<BLOCKNUM;i++) { buf[i] = (200+i); }
        MPI_Start(&req1); MPI_Wait(&req1,&sts1);

        printf(" rank=%d sleep1 start \n",rank); fflush(stdout);
        sleep(SLPTIM);
        printf(" rank=%d sleep1 end \n",rank); fflush(stdout);

        /* Message 3: started now, waited on only after the second sleep. */
        for (i=0;i<BLOCKNUM;i++) { buf[i] = (300+i); }
        MPI_Start(&req1);

        printf(" rank=%d sleep2 start \n",rank); fflush(stdout);
        sleep(SLPTIM);
        printf(" rank=%d sleep2 end \n",rank); fflush(stdout);

        MPI_Wait(&req1,&sts1);
        MPI_Request_free(&req1);
    } else {
        MPI_Recv_init(&buf[0],BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&req1);

        /* Receive 1: the only receive completed before the first sleep. */
        for (i=0;i<BLOCKNUM;i++) { buf[i] = 0; }
        MPI_Start(&req1); MPI_Wait(&req1,&sts1);
        printf(" rank=%d pass-1 %d \n",rank,buf[0]); fflush(stdout);

        printf(" rank=%d sleep1 start \n",rank); fflush(stdout);
        sleep(SLPTIM);
        printf(" rank=%d sleep1 end \n",rank); fflush(stdout);

        /* Receive 2: completed between the two sleeps. */
        for (i=0;i<BLOCKNUM;i++) { buf[i] = 0; }
        MPI_Start(&req1); MPI_Wait(&req1,&sts1);
        printf(" rank=%d pass-2 %d \n",rank,buf[0]); fflush(stdout);

        printf(" rank=%d sleep2 start \n",rank); fflush(stdout);
        sleep(SLPTIM);
        printf(" rank=%d sleep2 end \n",rank); fflush(stdout);

        /* Receive 3: completed after the second sleep. */
        for (i=0;i<BLOCKNUM;i++) { buf[i] = 0; }
        MPI_Start(&req1); MPI_Wait(&req1,&sts1);
        printf(" rank=%d pass-3 %d \n",rank,buf[0]); fflush(stdout);

        MPI_Request_free(&req1);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    free(buf);
    MPI_Finalize();

    if (rank == 0) {
        printf(" rank=%d Program End \n",rank); fflush(stdout);
    }
    return(0);
}