Hi,

I think I have found a bug in the implementation of the GM collective routines included in Open MPI. The GM software version is 2.0.30, for the PCI64 cards. Sometimes, when I broadcast a vector of 1024 integers with MPI_Bcast, some processes receive a corrupted packet. Usually the received data differs from the original in only one bit, but that is enough to break the communication.
I get the same problem with both version 1.4.1 and version 1.4.2.
Could you help me? Thanks.

Best regards,

  José I. Aliaga

=================================================
  COMPILATION COMMAND
=================================================
  mpicc test_comm.c -o test_comm

=================================================
  EXECUTION COMMAND
=================================================
mpirun --mca btl gm,sm,self -np 8 -machinefile mach_file test_comm 1 10 1000

=================================================
  SOME EXECUTION ERRORS
=================================================
## EXECUTION 1 ##
[2] receives 3039 when it must receive 7135. Its subtraction is 4096.
[2] receives 7142 when it must receive 7143. Its subtraction is 1.
[2,411th] (Bcast of 7 en 8) 1024 integers with 2 errors.

## EXECUTION 2 ##
[5] receives 7142 when it must receive 7136. Its subtraction is 6.
[5,277th] (Bcast of 0 en 8) 1024 integers with 1 errors.
[1] receives 7138 when it must receive 7140. Its subtraction is 2.
[1,385th] (Bcast of 4 en 8) 1024 integers with 1 errors.

## EXECUTION 3 ##
[5] receives 3038 when it must receive 7134. Its subtraction is 4096.
[5] receives 7141 when it must receive 7142. Its subtraction is 1.
[5,479th] (Bcast of 6 en 8) 1024 integers with 2 errors.

## EXECUTION 4 ##
[3] receives 3034 when it must receive 7130. Its subtraction is 4096.
[3] receives 7140 when it must receive 7138. Its subtraction is 2.
[3,539th] (Bcast of 2 en 8) 1024 integers with 2 errors.

## EXECUTION 5 ##
[5] receives 7135 when it must receive 3039. Its subtraction is 4096.
[5] receives 3046 when it must receive 3047. Its subtraction is 1.
[5,135th] (Bcast of 7 en 8) 1024 integers with 2 errors.

## EXECUTION 6 ##
[5] receives 7135 when it must receive 3039. Its subtraction is 4096.
[5] receives 3046 when it must receive 3047. Its subtraction is 1.
[5,246th] (Bcast of 7 en 8) 1024 integers with 2 errors.

## EXECUTION 7 ##
[2] receives 7128 when it must receive 3032. Its subtraction is 4096.
[2] receives 3047 when it must receive 3040. Its subtraction is 7.
[2,232th] (Bcast of 0 en 8) 1024 integers with 2 errors.

## EXECUTION 8 ##
[3] receives 3036 when it must receive 7132. Its subtraction is 4096.
[3] receives 7139 when it must receive 7140. Its subtraction is 1.
[3,344th] (Bcast of 4 en 8) 1024 integers with 2 errors.

=================================================
  SOURCE CODE --> test_comm.c
=================================================
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>

/*
 * Allocate an uninitialized vector of `tam` integers.
 *
 * Returns a pointer that the caller must release with free().  The function
 * never returns NULL: if `tam` is negative, if the byte count would not fit
 * in a size_t, or if malloc() fails, it prints a diagnostic carrying the
 * calling rank to stderr and aborts the whole MPI job.
 */
int *CreateVector (int tam) {
  int *ptr = NULL;

  /* Reject sizes whose byte count would wrap around before calling malloc. */
  if (tam >= 0 && (size_t) tam <= (size_t) -1 / sizeof *ptr)
    ptr = malloc ((size_t) tam * sizeof *ptr);

  if (ptr == NULL) {
    int my_id = -1;

    /* The rank is only needed for the diagnostic, so query it here. */
    MPI_Comm_rank (MPI_COMM_WORLD, &my_id);
    /* stderr rather than stdout: MPI_Abort may not flush buffered output. */
    fprintf (stderr, "ERROR MEMORIA (%d)\n", my_id);
    /* Abort every rank instead of exiting only this one. */
    MPI_Abort (MPI_COMM_WORLD, EXIT_FAILURE);
    exit (EXIT_FAILURE); /* not reached if MPI_Abort terminates the process */
  }

  return ptr;
}

/*
 * Fill the first `tam` entries of `vec`.
 *
 * When `inic` is non-negative the entries become inic, inic + P,
 * inic + 2P, ... where P is the size of MPI_COMM_WORLD.  When `inic` is
 * negative every entry is set to -1.
 */
void InitVector (int *vec, int tam, int inic) {
  int idx = 0;
  int next = inic;
  int stride;

  MPI_Comm_size(MPI_COMM_WORLD, &stride);

  if (inic < 0) {
    /* Negative start value: mark every slot as "not yet received". */
    while (idx < tam)
      vec[idx++] = -1;
    return;
  }

  while (idx < tam) {
    vec[idx++] = next;
    next += stride;
  }
}

/*
 * Check the first `tam` entries of `vec` against the sequence
 * inic, inic + P, inic + 2P, ... where P is the size of MPI_COMM_WORLD.
 *
 * Each mismatching entry is reported on stdout together with the calling
 * rank and the absolute difference between expected and received values.
 * Returns the number of mismatching entries (0 when the vector is correct).
 */
int CompareVector (int *vec, int tam, int inic) {
  int rank, stride, idx;
  int expected = inic;
  int mismatches = 0;

  MPI_Comm_size(MPI_COMM_WORLD, &stride);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  for (idx = 0; idx < tam; idx++, expected += stride) {
    int gap;

    if (vec[idx] == expected)
      continue;

    /* Absolute difference, so a single flipped bit shows as a power of two. */
    if (expected > vec[idx])
      gap = expected - vec[idx];
    else
      gap = vec[idx] - expected;

    printf ("[%d] receives %d when it must receive %d. Its subtraction is %d.\n", rank, vec[idx], expected, gap);
    mismatches++;
  }

  return mismatches;
}

/*
 * Parse `text` as a base-10 integer in the inclusive range [lo, hi].
 *
 * Returns 1 and stores the value in `*out` on success.  Returns 0, leaving
 * `*out` untouched, when the text is empty, has trailing characters,
 * overflows a long, or falls outside the range.
 */
static int ParseIntArg (const char *text, long lo, long hi, int *out) {
  char *end = NULL;
  long value;

  errno = 0;
  value = strtol (text, &end, 10);
  if (end == text || *end != '\0' || errno == ERANGE)
    return 0;
  if (value < lo || value > hi)
    return 0;

  *out = (int) value;
  return 1;
}

/*
 * Broadcast stress test.
 *
 * Usage: test_comm <pos1> <pos2> <nexecs>
 *
 * For every message size 2^pos1, 2^(pos1+1), ..., 2^pos2 (in integers), and
 * for `nexecs` repetitions, each rank in turn acts as the root of an
 * MPI_Bcast.  Every rank then verifies the received vector and reports any
 * mismatches on stdout.
 *
 * Returns 0 after a completed run (regardless of detected mismatches) and
 * EXIT_FAILURE when the command-line arguments are missing or invalid.
 */
int main (int argc, char **argv) {
  /* Largest shift for which (1 << shift) is still a positive int. */
  const int max_shift = (int) (sizeof(int) * CHAR_BIT) - 2;
  int pos, pos1 = 0, pos2 = 0, nexecs = 0;
  int k, root, tam, dim2, nerrors;
  int my_id, numprocs;
  int *mess = NULL;

  MPI_Init (&argc, &argv);
  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &my_id);

  /* Validate after MPI_Init, which is allowed to consume launcher arguments.
     Every rank performs the same check, so all of them finalize together. */
  if (argc < 4
      || !ParseIntArg (argv[1], 0, max_shift, &pos1)
      || !ParseIntArg (argv[2], 0, max_shift, &pos2)
      || !ParseIntArg (argv[3], 0, INT_MAX, &nexecs)) {
    if (my_id == 0)
      fprintf (stderr,
               "Usage: %s <log2 min size (0..%d)> <log2 max size (0..%d)> <repetitions>\n",
               (argc > 0 && argv[0] != NULL) ? argv[0] : "test_comm",
               max_shift, max_shift);
    MPI_Finalize ();
    return EXIT_FAILURE;
  }

  dim2 = 1 << pos2;

  MPI_Barrier (MPI_COMM_WORLD);

  /* One buffer sized for the largest message is reused for every size. */
  mess = CreateVector (dim2);
  InitVector (mess, dim2, -1);

  /* Iterate over exponents rather than doubling `tam`, so the loop counter
     can never overflow past the largest representable power of two. */
  for (pos = pos1; pos <= pos2; pos++) {
    tam = 1 << pos;
    for (k = 0; k < nexecs; k++) {
      for (root = 0; root < numprocs; root++) {
        /* Only the root holds the real data; the others start with -1. */
        InitVector (mess, tam, (my_id == root) ? root : -1);
        MPI_Bcast (mess, tam, MPI_INT, root, MPI_COMM_WORLD);
        nerrors = CompareVector (mess, tam, root);
        if (nerrors > 0)
          printf ("[%d,%dth] (Bcast of %d in %d) %d integers with %d errors\n",
                  my_id, k, root, numprocs, tam, nerrors);
      }
    }
  }

  free (mess);
  mess = NULL;

  MPI_Barrier (MPI_COMM_WORLD);

  MPI_Finalize ();

  return 0;
}



Reply via email to