Hi Mike, To enable fork support you need to set an additional environment variable. See Section 7.1.2 in the User Guide for more information:
http://mvapich.cse.ohio-state.edu/support/mvapich_user_guide.html#x1-350007.1.2 Thanks, Matt On Wed, 12 Nov 2008, Mike Heinz wrote: > I'm not sure when this stopped working, but I'm getting a complaint from > our QA people that our fork() test program is failing with mvapich1 and > mvapich2 when tested with OFED 1.4. When I tested with OFED 1.3.1, I got > a similar result: > > > [EMAIL PROTECTED] mpi_fork]$ mpirun_rsh -np 2 panic homer mpi_fork 128 1024 > Exit code -3 signaled from homer > Abort signaled by rank 0: [panic:0] Got completion with error > IBV_WC_LOC_LEN_ERR, code=1, dest rank=1 > > Killing remote processes...MPI process terminated unexpectedly > DONE > > > This is the program that generates the failure: > > #include <stdlib.h> > #include <math.h> > #include <assert.h> > #include <sys/wait.h> > > > #define MYBUFSIZE (4*1024*1028) > #define MAX_REQ_NUM 100000 > > char s_buf1[MYBUFSIZE]; > char r_buf1[MYBUFSIZE]; > > > MPI_Request request[MAX_REQ_NUM]; > MPI_Status my_stat[MAX_REQ_NUM]; > > int main(int argc,char *argv[]) > { > int myid, numprocs, i; > int size, loop, page_size; > char *s_buf, *r_buf; > double t_start=0.0, t_end=0.0, t=0.0; > > > MPI_Init(&argc,&argv); > MPI_Comm_size(MPI_COMM_WORLD,&numprocs); > MPI_Comm_rank(MPI_COMM_WORLD,&myid); > > if ( argc < 3 ) { > fprintf(stderr, "Usage: mpi_fork loop msg_size\n"); > MPI_Finalize(); > return 0; > } > size=atoi(argv[2]); > loop = atoi(argv[1]); > > if(size > MYBUFSIZE){ > fprintf(stderr, "Maximum message size is %d\n",MYBUFSIZE); > MPI_Finalize(); > return 0; > } > > if(loop > MAX_REQ_NUM){ > fprintf(stderr, "Maximum number of iterations is > %d\n",MAX_REQ_NUM); > MPI_Finalize(); > return 0; > } > > page_size = getpagesize(); > > s_buf = (char*)(((unsigned long)s_buf1 + (page_size -1))/page_size * > page_size); > r_buf = (char*)(((unsigned long)r_buf1 + (page_size -1))/page_size * > page_size); > > assert( (s_buf != NULL) && (r_buf != NULL) ); > > for ( i=0; i<size; i++ ){ > s_buf[i]='a'; > 
r_buf[i]='b'; > } > > /*warmup */ > if (myid == 0) > { > for ( i=0; i< loop; i++ ) { > MPI_Isend(s_buf, size, MPI_CHAR, 1, 100, MPI_COMM_WORLD, > request+i); > } > > MPI_Waitall(loop, request, my_stat); > MPI_Recv(r_buf, 4, MPI_CHAR, 1, 101, MPI_COMM_WORLD, > &my_stat[0]); > > }else{ > for ( i=0; i< loop; i++ ) { > MPI_Irecv(r_buf, size, MPI_CHAR, 0, 100, MPI_COMM_WORLD, > request+i); > } > MPI_Waitall(loop, request, my_stat); > MPI_Send(s_buf, 4, MPI_CHAR, 0, 101, MPI_COMM_WORLD); > } > // fork a child process and make sure it lives beyond parent > touching pages > // if fork is not properly handled in stack, parent would get a copy > // of its registered/locked pages (such as qp wqes) on 1st access > // and problems such as Local Length Error would be reported by HCA > if (fork() == 0) { > // child exists but doesn't touch anything, parent still owns > pages > sleep(10); > // exec another program > execlp("date", "date", NULL); > // just in case exec fails > exit(0); > } > > MPI_Barrier(MPI_COMM_WORLD); > > if (myid == 0) > { > t_start=MPI_Wtime(); > for ( i=0; i< loop; i++ ) { > MPI_Isend(s_buf, size, MPI_CHAR, 1, 100, MPI_COMM_WORLD, > request+i); > } > > MPI_Waitall(loop, request, my_stat); > MPI_Recv(r_buf, 4, MPI_CHAR, 1, 101, MPI_COMM_WORLD, > &my_stat[0]); > > t_end=MPI_Wtime(); > t = t_end - t_start; > > }else{ > for ( i=0; i< loop; i++ ) { > MPI_Irecv(r_buf, size, MPI_CHAR, 0, 100, MPI_COMM_WORLD, > request+i); > } > MPI_Waitall(loop, request, my_stat); > MPI_Send(s_buf, 4, MPI_CHAR, 0, 101, MPI_COMM_WORLD); > } > > if ( myid == 0 ) { > double tmp; > tmp = ((size*1.0)/1.0e6)*loop; > fprintf(stdout,"%d\t%f\n", size, tmp/t); > } > { > int status; > int ret; > > ret = wait(&status); > if (ret == -1 || ! 
WIFEXITED(status) || WEXITSTATUS(status) != > 0) > { > fprintf(stdout,"ERROR: child failure: ret=%d, status=0x%x, > exit_status=%d\n", ret, status, WEXITSTATUS(status)); > } > } > > MPI_Barrier(MPI_COMM_WORLD); > MPI_Finalize(); > return 0; > } > > > -- > Michael Heinz > Principal Engineer, Qlogic Corporation > King of Prussia, Pennsylvania > > _______________________________________________ > mvapich-discuss mailing list > [EMAIL PROTECTED] > http://mail.cse.ohio-state.edu/mailman/listinfo/mvapich-discuss > _______________________________________________ general mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
