I am experiencing some issues with Open MPI 1.2 running on a Rocks 4.2.1
cluster (the issues also appear with Open MPI 1.1.5 and 1.1.4).

When I run my program with the frontend in the list of nodes, the processes
deadlock.

When I run it without the frontend in the list of nodes, it runs to
completion.

The simplest test program that reproduces this (test1.c, attached below)
just calls MPI_Init, MPI_Barrier, and MPI_Finalize.

So the following deadlocks:

   /users/gunter $ mpirun -np 3 -H frontend,compute-0-0,compute-0-1 ./test1
   host:compute-0-1.local made it past the barrier, ret:0
   mpirun: killing job...

   mpirun noticed that job rank 0 with PID 15384 on node frontend exited on signal 15 (Terminated).
   2 additional processes aborted (not shown)

This runs to completion:

   /users/gunter $ mpirun -np 3 -H compute-0-0,compute-0-1,compute-0-2 ./test1
   host:compute-0-1.local made it past the barrier, ret:0
   host:compute-0-0.local made it past the barrier, ret:0
   host:compute-0-2.local made it past the barrier, ret:0

If I have the compute nodes send a message to the frontend prior to the
barrier (test2.c, attached below), it runs to completion:

   /users/gunter $ mpirun -np 3 -H frontend,compute-0-0,compute-0-1 ./test2 0
   host:     frontend.domain node:  0 is the master
   host:   compute-0-0.local node:  1 sent:  1 to:    0
   host:   compute-0-1.local node:  2 sent:  2 to:    0
   host:     frontend.domain node:  0 recv:  1 from:  1
   host:     frontend.domain node:  0 recv:  2 from:  2
   host:     frontend.domain made it past the barrier, ret:0
   host:   compute-0-1.local made it past the barrier, ret:0
   host:   compute-0-0.local made it past the barrier, ret:0

If I have a different node act as the master, it deadlocks:

   /users/gunter $ mpirun -np 3 -H frontend,compute-0-0,compute-0-1 ./test2 1
   host:   compute-0-0.local node:  1 is the master
   host:   compute-0-1.local node:  2 sent:  2 to:    1
   mpirun: killing job...

   mpirun noticed that job rank 0 with PID 15411 on node frontend exited on signal 15 (Terminated).
   2 additional processes aborted (not shown)

How is it that in the first example one node makes it past the barrier
while the rest deadlock?

Both programs run to completion under two other MPI implementations.

Is something misconfigured on my cluster, or is this potentially an
Open MPI bug?

What is the best way to debug this?
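
One thing I was going to try (I am not sure these are the right knobs) is
turning up the TCP BTL verbosity, and restricting Open MPI to the
cluster-private interface, since the Rocks frontend has more network
interfaces than the compute nodes do. The interface name eth0 below is
only a guess at the private NIC:

   /users/gunter $ mpirun --mca btl_base_verbose 30 -np 3 \
       -H frontend,compute-0-0,compute-0-1 ./test1

   /users/gunter $ mpirun --mca btl tcp,self --mca btl_tcp_if_include eth0 \
       -np 3 -H frontend,compute-0-0,compute-0-1 ./test1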

Any help would be appreciated!

--tim
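
/* test1.c: MPI_Init -> MPI_Barrier -> MPI_Finalize */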
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <mpi.h>

int main(int c, char **v)
{
   int   ret;
   char *host = NULL;

   host  = (char *) calloc(128, sizeof(char));
   gethostname(host, 64);

   /* init mpi */
   ret = MPI_Init(&c, &v);
   assert(ret == MPI_SUCCESS);

   /* synchronize */
   ret = MPI_Barrier(MPI_COMM_WORLD);
   assert(ret == MPI_SUCCESS);

   printf("host:%s made it past the barrier, ret:%d\n", host, ret);
   fflush(stdout);

   /* exit */
   MPI_Finalize();

   free(host);

   return 0;
}
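
/* test2.c: if a master rank is given on the command line, every other rank
   sends its rank number to the master before the barrier */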
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <mpi.h>

int main(int argc, char **argv)
{
   int   ret;
   char *host = NULL;

   host  = (char *) calloc(128, sizeof(char));
   gethostname(host, 64);

   /* init mpi */
   ret = MPI_Init(&argc, &argv);
   assert(ret == MPI_SUCCESS);

   if(argc > 1)
   {
      int master = 0;
      int rank, size;

      master = atoi(argv[1]);

      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &size);

      if(rank != master)
      {
         MPI_Send(&rank, 1, MPI_INT, master, rank, MPI_COMM_WORLD);
         printf("host:%20s node:%3d sent:%3d to:  %3d\n", host, rank, rank, 
master);
         fflush(stdout);
      }
      /* is master node */
      else
      {
         int i;
         int val;
         MPI_Status status;

         printf("host:%20s node:%3d is the master\n", host, rank);
         fflush(stdout);

         for(i = 0; i < size; ++i)
         {
            if(i == master) continue;
            MPI_Recv(&val, 1, MPI_INT, i, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
            printf("host:%20s node:%3d recv:%3d from:%3d\n", host, rank, val, 
status.MPI_SOURCE);
            fflush(stdout);
         }
      }
   }

   /* synchronize */
   ret = MPI_Barrier(MPI_COMM_WORLD);
   assert(ret == MPI_SUCCESS);

   printf("host:%20s made it past the barrier, ret:%d\n", host, ret);
   fflush(stdout);

   /* exit */
   MPI_Finalize();

   free(host);

   return 0;
}
