Hi All,

Update to the original posting: METHOD4 also resulted in a deadlock on system 
HPC2 after 5 hours of running with 32 MPI tasks; also, "const int scale=1;" was 
missing from the code snippet posted above.

--Evghenii

> Message: 2
> Date: Sun, 18 Sep 2011 02:06:33 +0000
> From: Evghenii Gaburov <e-gabu...@northwestern.edu>
> Subject: [OMPI users] custom sparse collective non-reproducible
>       deadlock, MPI_Sendrecv, MPI_Isend/MPI_Irecv or MPI_Send/MPI_Recv
>       question
> To: "us...@open-mpi.org" <us...@open-mpi.org>
> Message-ID: <8509050a-7357-408e-8d58-c5aefa7b3...@northwestern.edu>
> Content-Type: text/plain; charset="us-ascii"
> 
> Hi All,
> 
> My MPI program's basic task consists of regularly establishing point-to-point 
> communication with other procs via MPI_Alltoall, and then communicating 
> data. I tested it on two HPC clusters with 32-256 MPI tasks. On one of the 
> systems (HPC1) this custom collective runs flawlessly, while on the other one 
> (HPC2) the collective causes non-reproducible deadlocks (after a day of 
> running, or after a few hours or so). So, I want to figure out whether it is 
> a system (HPC2) bug that I can report to the HPC admins, or a subtle bug in 
> my code that needs to be fixed. One possibly important point: I communicate 
> a huge amount of data between tasks (up to ~2GB of data) in several all2all 
> calls.
> 
> I would like to have expert eyes look at the code to confirm or disprove 
> that the code is deadlock-safe. I have implemented several methods (METHOD1 - 
> METHOD4) that, if I am not mistaken, should in principle be deadlock-safe. 
> However, as a beginner MPI user, I can easily miss something subtle; as such, 
> I seek your help with this! I mostly used METHOD4, which has caused periodic 
> deadlocks, after having deadlocks with METHOD1 and METHOD2. On HPC1 none of 
> these methods deadlock in my runs. I am currently testing METHOD3, so I cannot 
> comment on it yet but will later; however, I will be happy to hear your 
> comments.
> 
> Both systems use openmpi-1.4.3.
> 
> Your answers will be of great help! Thanks!
> 
> Cheers,
> Evghenii
> 
> Here is the code snippet:
> 
>  // Sparse all-to-all exchange built from point-to-point MPI calls.
>  //
>  // Each task first tells every other task (via MPI_Alltoall) how many
>  // elements it is about to send, then ships sbuf[p] to task p and receives
>  // task p's contribution into rbuf[p].  The local slice is copied directly
>  // (rbuf[myid] = sbuf[myid]).  The transport strategy is picked at compile
>  // time with one of _METHOD1_ .. _METHOD4_.
>  //
>  //   sbuf   array of nproc vectors; sbuf[p] holds the data destined for p
>  //   rbuf   array of nproc vectors; rbuf[p] is resized and overwritten
>  //   myid   index of the calling task (presumably its rank in
>  //          MPI_COMM_WORLD -- confirm against the caller)
>  //   nproc  number of tasks; the static tables below hold NMAXPROC entries,
>  //          so nproc must not exceed NMAXPROC
>  //
>  // NMAXPROC and datatype<T>() are expected to be defined elsewhere.
>  // NOTE(review): counts are stored as int, so a per-peer vector with more
>  // than INT_MAX elements cannot be represented here.
>  template<class T>
>    void  all2all(std::vector<T> sbuf[], std::vector<T> rbuf[],
>        const int myid,
>        const int nproc)
>    {
>      // Multiplier applied to the element counts; this declaration was
>      // missing from the snippet as first posted.
>      const int scale = 1;
> 
>      static int nsend[NMAXPROC], nrecv[NMAXPROC];
>      for (int p = 0; p < nproc; p++)
>        nsend[p] = sbuf[p].size();
> 
>      // Let the other tasks know how much data they will receive from this
>      // one.
>      MPI_Alltoall(nsend, 1, MPI_INT, nrecv, 1, MPI_INT, MPI_COMM_WORLD);
> 
> #ifdef _METHOD1_
> 
>      // Post every non-blocking send and receive, then wait for all of them.
>      // Up to two requests (one send, one receive) are created per peer, so
>      // the status array has to be as long as the request array.
>      static MPI_Status  stat[NMAXPROC*2];
>      static MPI_Request  req[NMAXPROC*2];
>      int nreq = 0;
>      for (int p = 0; p < nproc; p++)
>        if (p != myid)
>        {
>          const int scount = nsend[p]*scale;
>          const int rcount = nrecv[p]*scale;
>          rbuf[p].resize(rcount);
>          if (scount > 0)
>            MPI_Isend(&sbuf[p][0], scount, datatype<T>(), p, 1,
>                      MPI_COMM_WORLD, &req[nreq++]);
>          if (rcount > 0)
>            MPI_Irecv(&rbuf[p][0], rcount, datatype<T>(), p, 1,
>                      MPI_COMM_WORLD, &req[nreq++]);
>        }
>      rbuf[myid] = sbuf[myid];
>      MPI_Waitall(nreq, req, stat);
> 
> #elif defined _METHOD2_
> 
>      // One combined send/receive per peer, visited in increasing order.
>      static MPI_Status stat;
>      for (int p = 0; p < nproc; p++)
>        if (p != myid)
>        {
>          const int scount = nsend[p]*scale;
>          const int rcount = nrecv[p]*scale;
>          rbuf[p].resize(rcount);
>          // Tested separately so that the sum of two large counts cannot
>          // overflow int.
>          if (scount > 0 || rcount > 0)
>          {
>            // Only one direction may carry data; never index an empty
>            // vector to obtain its address.
>            T* const sptr = scount > 0 ? &sbuf[p][0] : static_cast<T*>(0);
>            T* const rptr = rcount > 0 ? &rbuf[p][0] : static_cast<T*>(0);
>            MPI_Sendrecv(sptr, scount, datatype<T>(), p, 1,
>                         rptr, rcount, datatype<T>(), p, 1,
>                         MPI_COMM_WORLD, &stat);
>          }
>        }
>      rbuf[myid] = sbuf[myid];
> 
> #elif defined _METHOD3_
> 
>      // Step through the distances 1 .. nproc-1: at each step send to the
>      // task `dist` ahead and receive from the task `dist` behind, waiting
>      // for both before moving on.  At most two requests are in flight.
>      static MPI_Status  stat[NMAXPROC  ];
>      static MPI_Request  req[NMAXPROC*2];
>      for (int dist = 1; dist < nproc; dist++)
>      {
>        const int src = (nproc + myid - dist) % nproc;
>        const int dst = (nproc + myid + dist) % nproc;
>        const int scount = nsend[dst]*scale;
>        const int rcount = nrecv[src]*scale;
>        rbuf[src].resize(rcount);
>        int nreq = 0;
>        if (scount > 0)
>          MPI_Isend(&sbuf[dst][0], scount, datatype<T>(), dst, 1,
>                    MPI_COMM_WORLD, &req[nreq++]);
>        if (rcount > 0)
>          MPI_Irecv(&rbuf[src][0], rcount, datatype<T>(), src, 1,
>                    MPI_COMM_WORLD, &req[nreq++]);
>        MPI_Waitall(nreq, req, stat);
>      }
>      rbuf[myid] = sbuf[myid];
> 
> #elif defined _METHOD4_
> 
>      // Same distance schedule as METHOD3 but with blocking calls; the
>      // parity of myid/dist selects whether a task sends or receives first.
>      static MPI_Status stat;
>      for (int dist = 1; dist < nproc; dist++)
>      {
>        const int src = (nproc + myid - dist) % nproc;
>        const int dst = (nproc + myid + dist) % nproc;
>        const int scount = nsend[dst]*scale;
>        const int rcount = nrecv[src]*scale;
>        rbuf[src].resize(rcount);
>        if ((myid/dist) & 1)
>        {
>          if (scount > 0)
>            MPI_Send(&sbuf[dst][0], scount, datatype<T>(), dst, 1,
>                     MPI_COMM_WORLD);
>          if (rcount > 0)
>            MPI_Recv(&rbuf[src][0], rcount, datatype<T>(), src, 1,
>                     MPI_COMM_WORLD, &stat);
>        }
>        else
>        {
>          if (rcount > 0)
>            MPI_Recv(&rbuf[src][0], rcount, datatype<T>(), src, 1,
>                     MPI_COMM_WORLD, &stat);
>          if (scount > 0)
>            MPI_Send(&sbuf[dst][0], scount, datatype<T>(), dst, 1,
>                     MPI_COMM_WORLD);
>        }
>      }
>      rbuf[myid] = sbuf[myid];
> #endif
> }

Reply via email to